Missing conjugate already fixed in develop

Faster copy region
Accelerator peek poke
2025-11-16 11:39:31 +00:00 · 2020-04-10 11:11:24 -04:00 · 2020-04-10 11:10:52 -04:00 · 2020-04-10 11:09:59 -04:00 · 2020-04-10 11:09:11 -04:00 · 2020-04-10 11:08:19 -04:00
137 changed files with 8996 additions and 2012 deletions
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 #include <Grid/algorithms/approx/Zolotarev.h>
 #include <Grid/algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/JacobiPolynomial.h>
 #include <Grid/algorithms/approx/Remez.h>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -1,3 +1,14 @@
+    // blockZaxpy in blockPromote - 3s, 5%
+    // noncoalesced linalg in Preconditioner ~ 3s 5%
+    // Lanczos tuning or replace 10-20s ~ 25%, open ended
+    // setup tuning   5s  ~  8%
+    //    -- e.g. ordermin, orderstep tunables.
+    // MdagM path without norm in LinOp code.     few seconds
+
+    // Mdir calc blocking kernels
+    // Fuse kernels in blockMaskedInnerProduct
+    // preallocate Vectors in Cayley 5D ~ few percent few seconds
+
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -34,15 +45,36 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+template<class vobj,class CComplex>
+inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
+				    const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask,
+				    const Lattice<vobj> &fineX,
+				    const Lattice<vobj> &fineY)
+{
+  typedef decltype(innerProduct(vobj(),vobj())) dotp;
+
+  GridBase *coarse(CoarseInner.Grid());
+  GridBase *fine  (fineX.Grid());
+
+  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
+  Lattice<dotp> fine_inner_msk(fine);
+
+  // Multiply could be fused with innerProduct
+  // Single block sum kernel could do both masks.
+  fine_inner = localInnerProduct(fineX,fineY);
+  mult(fine_inner_msk, fine_inner,FineMask);
+  blockSum(CoarseInner,fine_inner_msk);
+}
+
+
 class Geometry {
-  //    int dimension;
 public:
  int npoint;
  std::vector<int> directions   ;
  std::vector<int> displacements;

  Geometry(int _d)  {
-  
+    
    int base = (_d==5) ? 1:0;

    // make coarse grid stencil for 4d , not 5d
@@ -52,10 +84,10 @@ public:
    directions.resize(npoint);
    displacements.resize(npoint);
    for(int d=0;d<_d;d++){
-      directions[2*d  ] = d+base;
-      directions[2*d+1] = d+base;
-      displacements[2*d  ] = +1;
-      displacements[2*d+1] = -1;
+      directions[d   ] = d+base;
+      directions[d+_d] = d+base;
+      displacements[d  ] = +1;
+      displacements[d+_d]= -1;
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
@@ -63,7 +95,7 @@ public:
    //// report back
    std::cout<<GridLogMessage<<"directions    :";
    for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
-    std::cout <<std::endl;
+    std::cout<<std::endl;
    std::cout<<GridLogMessage<<"displacements :";
    for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
    std::cout<<std::endl;
@@ -115,10 +147,10 @@ public:
  
  void Orthogonalise(void){
    CoarseScalar InnerProd(CoarseGrid); 
-    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
-    blockOrthogonalise(InnerProd,subspace);
-    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
+    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
+    //    blockOrthogonalise(InnerProd,subspace);
    //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
    //      CheckOrthogonal();
  } 
@@ -128,7 +160,7 @@ public:
    for(int i=0;i<nbasis;i++){
      blockProject(iProj,subspace[i],subspace);
      eProj=Zero(); 
-      thread_for(ss, CoarseGrid->oSites(),{
+      accelerator_for(ss, CoarseGrid->oSites(),1,{
 	eProj[ss](i)=CComplex(1.0);
      });
      eProj=eProj - iProj;
@@ -146,66 +178,14 @@ public:
  void CreateSubspaceRandom(GridParallelRNG &RNG){
    for(int i=0;i<nbasis;i++){
      random(RNG,subspace[i]);
-      std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
    }
-    Orthogonalise();
  }

-  /*
-    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
-    {
-    // Run a Lanczos with sloppy convergence
-    const int Nstop = nn;
-    const int Nk = nn+20;
-    const int Np = nn+20;
-    const int Nm = Nk+Np;
-    const int MaxIt= 10000;
-    RealD resid = 1.0e-3;
-
-    Chebyshev<FineField> Cheb(0.5,64.0,21);
-    ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
-    //	IRL.lock = 1;
-
-    FineField noise(FineGrid); gaussian(RNG,noise);
-    FineField tmp(FineGrid); 
-    std::vector<RealD>     eval(Nm);
-    std::vector<FineField> evec(Nm,FineGrid);
-
-    int Nconv;
-    IRL.calc(eval,evec,
-    noise,
-    Nconv);
-
-    // pull back nn vectors
-    for(int b=0;b<nn;b++){
-
-    subspace[b]   = evec[b];
-
-    std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
-
-    hermop.Op(subspace[b],tmp); 
-    std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
-
-    noise = tmp -  sqrt(eval[b])*subspace[b] ;
-
-    std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
-
-    noise = tmp +  eval[b]*subspace[b] ;
-
-    std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
-
-    }
-    Orthogonalise();
-    for(int b=0;b<nn;b++){
-    std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
-    }
-    }
-  */
  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {

    RealD scale;

-    ConjugateGradient<FineField> CG(1.0e-2,10000);
+    ConjugateGradient<FineField> CG(1.0e-2,100,false);
    FineField noise(FineGrid);
    FineField Mn(FineGrid);

@@ -232,21 +212,316 @@ public:
      subspace[b]   = noise;

    }
-
-    Orthogonalise();
-
  }
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
+  // and this is the best I found
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+#if 1
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {
+      // Filter
+      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      b++;
+    }
+
+    // Generate a full sequence of Chebyshevs
+    {
+      lo=filterlo;
+      noise=Mn;
+
+      FineField T0(FineGrid); T0 = noise;  
+      FineField T1(FineGrid); 
+      FineField T2(FineGrid);
+      FineField y(FineGrid);
+      
+      FineField *Tnm = &T0;
+      FineField *Tn  = &T1;
+      FineField *Tnp = &T2;
+
+      // Tn=T1 = (xscale M + mscale)in
+      RealD xscale = 2.0/(hi-lo);
+      RealD mscale = -(hi+lo)/(hi-lo);
+      hermop.HermOp(T0,y);
+      T1=y*xscale+noise*mscale;
+
+      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
+	
+	hermop.HermOp(*Tn,y);
+
+	auto y_v = y.View();
+	auto Tn_v = Tn->View();
+	auto Tnp_v = Tnp->View();
+	auto Tnm_v = Tnm->View();
+	const int Nsimd = CComplex::Nsimd();
+	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+        });
+
+	// Possibly more fine-grained control is needed than a linear sweep,
+	// but there is a huge productivity gain if this is a simple algorithm and not a tunable
+	int m =1;
+	if ( n>=ordermin ) m=n-ordermin;
+	if ( (m%orderstep)==0 ) { 
+	  Mn=*Tnp;
+	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
+	  subspace[b] = Mn;
+	  hermop.Op(Mn,tmp); 
+	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  b++;
+	}
+
+	// Cycle pointers to avoid copies
+	FineField *swizzle = Tnm;
+	Tnm    =Tn;
+	Tn     =Tnp;
+	Tnp    =swizzle;
+	  
+      }
+    }
+    assert(b==nn);
+  }
+#endif
+#if 0
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+    FineField combined(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+#define FILTERb(llo,hhi,oorder)						\
+    {									\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
+      Cheb(hermop,noise,Mn);						\
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
+      subspace[b]   = Mn;						\
+      hermop.Op(Mn,tmp);						\
+      std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									
+
+    //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);	\
+
+    RealD alpha=-0.8;
+    RealD beta =-0.8;
+#define FILTER(llo,hhi,oorder)						\
+    {									\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
+      /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
+      Cheb(hermop,noise,Mn);						\
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
+      subspace[b]   = Mn;						\
+      hermop.Op(Mn,tmp);						\
+      std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									
+    
+#define FILTERc(llo,hhi,oorder)				\
+    {							\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);	\
+      Cheb(hermop,noise,combined);			\
+    }									
+
+    double node = 0.000;
+    FILTERb(lo,hi,orderfilter);// 0
+    //    FILTERc(node,hi,51);// 0
+    noise = Mn;
+    int base = 0;
+    int mult = 100;
+    FILTER(node,hi,base+1*mult);
+    FILTER(node,hi,base+2*mult);
+    FILTER(node,hi,base+3*mult);
+    FILTER(node,hi,base+4*mult);
+    FILTER(node,hi,base+5*mult);
+    FILTER(node,hi,base+6*mult);
+    FILTER(node,hi,base+7*mult);
+    FILTER(node,hi,base+8*mult);
+    FILTER(node,hi,base+9*mult);
+    FILTER(node,hi,base+10*mult);
+    FILTER(node,hi,base+11*mult);
+    FILTER(node,hi,base+12*mult);
+    FILTER(node,hi,base+13*mult);
+    FILTER(node,hi,base+14*mult);
+    FILTER(node,hi,base+15*mult);
+    assert(b==nn);
+  }
+#endif
+
+#if 0
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+    FineField combined(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {						
+      Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
+      //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
+      //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
+      //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
+      JacobiPoly(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp);
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; 
+      b++;
+      //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
+      //      subspace[b]   = tmp;      b++;
+      //    }									
+    }									
+
+#define FILTER(lambda)						\
+    {								\
+      hermop.HermOp(subspace[0],tmp);				\
+      tmp = tmp - lambda *subspace[0];				\
+      scale = std::pow(norm2(tmp),-0.5);			\
+      tmp=tmp*scale;							\
+      subspace[b]   = tmp;						\
+      hermop.Op(subspace[b],tmp);					\
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									
+    //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
+    //      subspace[b]   = tmp;      b++;
+    //    }									
+
+    FILTER(2.0e-5);
+    FILTER(2.0e-4);
+    FILTER(4.0e-4);
+    FILTER(8.0e-4);
+    FILTER(8.0e-4);
+
+    FILTER(2.0e-3);
+    FILTER(3.0e-3);
+    FILTER(4.0e-3);
+    FILTER(5.0e-3);
+    FILTER(6.0e-3);
+
+    FILTER(2.5e-3);
+    FILTER(3.5e-3);
+    FILTER(4.5e-3);
+    FILTER(5.5e-3);
+    FILTER(6.5e-3);
+
+    //    FILTER(6.0e-5);//6
+    //    FILTER(7.0e-5);//8
+    //    FILTER(8.0e-5);//9
+    //    FILTER(9.0e-5);//3
+
+    /*
+    //    FILTER(1.0e-4);//10
+    FILTER(2.0e-4);//11
+    //   FILTER(3.0e-4);//12
+    //    FILTER(4.0e-4);//13
+    FILTER(5.0e-4);//14
+
+    FILTER(6.0e-3);//4
+    FILTER(7.0e-4);//1
+    FILTER(8.0e-4);//7
+    FILTER(9.0e-4);//15
+    FILTER(1.0e-3);//2
+
+    FILTER(2.0e-3);//2
+    FILTER(3.0e-3);//2
+    FILTER(4.0e-3);//2
+    FILTER(5.0e-3);//2
+    FILTER(6.0e-3);//2
+
+    FILTER(7.0e-3);//2
+    FILTER(8.0e-3);//2
+    FILTER(1.0e-2);//2
+    */
+    std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
+    assert(b==nn);
+  }
+#endif
+
+
 };
+
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
 class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
 public:
    
-  typedef iVector<CComplex,nbasis >             siteVector;
+  typedef iVector<CComplex,nbasis >           siteVector;
+  typedef Lattice<CComplex >                  CoarseComplexField;
  typedef Lattice<siteVector>                 CoarseVector;
  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-
+  typedef iMatrix<CComplex,nbasis >  Cobj;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;

@@ -255,11 +530,11 @@ public:
  ////////////////////
  Geometry         geom;
  GridBase *       _grid; 
+  int hermitian;

  CartesianStencil<siteVector,siteVector,int> Stencil; 

  std::vector<CoarseMatrix> A;
-
      
  ///////////////////////
  // Interface
@@ -271,64 +546,136 @@ public:
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());

+    //    RealD Nin = norm2(in);
    SimpleCompressor<siteVector> compressor;
+
+    double comms_usec = -usecond();
    Stencil.HaloExchange(in,compressor);
+    comms_usec += usecond();
+
    auto in_v = in.View();
-    auto out_v = in.View();
-    thread_for(ss,Grid()->oSites(),{
-      siteVector res = Zero();
-      siteVector nbr;
+    auto out_v = out.View();
+    typedef LatticeView<Cobj> Aview;
+
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
+    Aview *Aview_p = & AcceleratorViewContainer[0];
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    GridStopWatch ArithmeticTimer;
+    int osites=Grid()->oSites();
+    //    double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
+    //    double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
+    double usecs =-usecond();
+    // assert(geom.npoint==9);
+
+    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
+      int ss = sss/nbasis;
+      int b  = sss%nbasis;
+      calcComplex res = Zero();
+      calcVector nbr;
      int ptype;
      StencilEntry *SE;
+
+      int lane=SIMTlane(Nsimd);
      for(int point=0;point<geom.npoint;point++){

 	SE=Stencil.GetEntry(ptype,point,ss);
 	  
-	if(SE->_is_local&&SE->_permute) { 
-	  permute(nbr,in_v[SE->_offset],ptype);
-	} else if(SE->_is_local) { 
-	  nbr = in_v[SE->_offset];
+	if(SE->_is_local) { 
+	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
 	} else {
-	  nbr = Stencil.CommBuf()[SE->_offset];
+	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+	}
+	synchronise();
+
+	for(int bb=0;bb<nbasis;bb++) {
+	  res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 	}
-	auto A_point = A[point].View();
-	res = res + A_point[ss]*nbr;
      }
-      vstream(out_v[ss],res);
+      coalescedWrite(out_v[ss](b),res,lane);
    });
-    return norm2(out);
+    usecs +=usecond();
+
+    double nrm_usec=-usecond();
+    RealD Nout= norm2(out);
+    nrm_usec+=usecond();
+
+    /*
+        std::cout << GridLogMessage << "\tNorm        " << nrm_usec << " us" <<std::endl;
+        std::cout << GridLogMessage << "\tHalo        " << comms_usec << " us" <<std::endl;
+        std::cout << GridLogMessage << "\tMatrix      " << usecs << " us" <<std::endl;
+        std::cout << GridLogMessage << "\t  mflop/s   " << flops/usecs<<std::endl;
+        std::cout << GridLogMessage << "\t  MB/s      " << bytes/usecs<<std::endl;
+    */
+    return Nout;
  };

-  RealD Mdag (const CoarseVector &in, CoarseVector &out){
-    // // corresponds to Petrov-Galerkin coarsening
-    // return M(in,out);
-    
-    // corresponds to Galerkin coarsening
-    CoarseVector tmp(Grid());
-    G5C(tmp, in);
-    M(tmp, out);
-    G5C(out, out);
-    return norm2(out);
+  RealD Mdag (const CoarseVector &in, CoarseVector &out)
+  {
+    if(hermitian) {
+      // corresponds to Petrov-Galerkin coarsening
+      return M(in,out);
+    } else {
+      // corresponds to Galerkin coarsening
+      CoarseVector tmp(Grid());
+      G5C(tmp, in); 
+      M(tmp, out);
+      G5C(out, out);
+      return norm2(out);
+    }
  };
-
-  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
-    
-    conformable(_grid,in.Grid());
-    conformable(in.Grid(),out.Grid());
-    
+  void MdirComms(const CoarseVector &in)
+  {
    SimpleCompressor<siteVector> compressor;
    Stencil.HaloExchange(in,compressor);
-    
-    auto point = [dir, disp](){
-      if(dir == 0 and disp == 0)
-	return 8;
-      else
-	return (4 * dir + 1 - disp) / 2;
-    }();
+  }
+  void MdirCalc(const CoarseVector &in, CoarseVector &out, int point)
+  {
+    conformable(_grid,in.Grid());
+    conformable(_grid,out.Grid());
+
+    typedef LatticeView<Cobj> Aview;
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
+    Aview *Aview_p = & AcceleratorViewContainer[0];

    auto out_v = out.View();
    auto in_v  = in.View();
-    thread_for(ss,Grid()->oSites(),{
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
+      int ss = sss/nbasis;
+      int b  = sss%nbasis;
+      calcComplex res = Zero();
+      calcVector nbr;
+      int ptype;
+      StencilEntry *SE;
+
+      int lane=SIMTlane(Nsimd);
+      SE=Stencil.GetEntry(ptype,point,ss);
+	  
+      if(SE->_is_local) { 
+	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+      } else {
+	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+      }
+      synchronise();
+
+      for(int bb=0;bb<nbasis;bb++) {
+	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+      }
+      coalescedWrite(out_v[ss](b),res,lane);
+    });
+#if 0
+    accelerator_for(ss,Grid()->oSites(),1,{
+
      siteVector res = Zero();
      siteVector nbr;
      int ptype;
@@ -343,43 +690,112 @@ public:
      } else {
 	nbr = Stencil.CommBuf()[SE->_offset];
      }
+      synchronise();

-      auto A_point = A[point].View();
-      res = res + A_point[ss]*nbr;
+      res = res + Aview_p[point][ss]*nbr;
      
-      vstream(out_v[ss],res);
+      out_v[ss]=res;
    });
+#endif
+  }
+  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
+  {
+    this->MdirComms(in);
+    int ndir=geom.npoint-1;
+    if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { 
+      std::cout <<"MdirAll out size "<< out.size()<<std::endl;
+      std::cout <<"MdirAll ndir "<< ndir<<std::endl;
+      assert(0);
+    }
+    for(int p=0;p<ndir;p++){
+      MdirCalc(in,out[p],p);
+    }
+  };
+  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
+
+    this->MdirComms(in);
+
+    int ndim = in.Grid()->Nd();
+
+    //////////////
+    // 4D action like wilson
+    // 0+ => 0 
+    // 0- => 1
+    // 1+ => 2 
+    // 1- => 3
+    // etc..
+    //////////////
+    // 5D action like DWF
+    // 1+ => 0 
+    // 1- => 1
+    // 2+ => 2 
+    // 2- => 3
+    // etc..
+    auto point = [dir, disp, ndim](){
+      if(dir == 0 and disp == 0)
+	return 8;
+      else if ( ndim==4 ) { 
+	return (4 * dir + 1 - disp) / 2;
+      } else { 
+	return (4 * (dir-1) + 1 - disp) / 2;
+      }
+    }();
+
+    MdirCalc(in,out,point);
+
  };

-  void Mdiag(const CoarseVector &in, CoarseVector &out){
-    Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
+  void Mdiag(const CoarseVector &in, CoarseVector &out)
+  {
+    int point=geom.npoint-1;
+    MdirCalc(in, out, point); // No comms
  };

  
- CoarsenedMatrix(GridCartesian &CoarseGrid) 	: 
+ CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	: 

    _grid(&CoarseGrid),
    geom(CoarseGrid._ndimension),
+    hermitian(hermitian_),
    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-    A(geom.npoint,&CoarseGrid)
+      A(geom.npoint,&CoarseGrid)
  {
  };

  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & Subspace){
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
+    typedef typename Fobj::scalar_type scalar_type;

-    FineField iblock(FineGrid); // contributions from within this block
-    FineField oblock(FineGrid); // contributions from outwith this block
+    FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
+    FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
+
+    std::vector<FineComplexField> masks(geom.npoint,FineGrid);
+    FineComplexField imask(FineGrid); // contributions from within this block
+    FineComplexField omask(FineGrid); // contributions from outwith this block
+
+    FineComplexField evenmask(FineGrid);
+    FineComplexField oddmask(FineGrid); 

    FineField     phi(FineGrid);
    FineField     tmp(FineGrid);
    FineField     zz(FineGrid); zz=Zero();
    FineField    Mphi(FineGrid);
+    FineField    Mphie(FineGrid);
+    FineField    Mphio(FineGrid);
+    std::vector<FineField>     Mphi_p(geom.npoint,FineGrid);

-    Lattice<iScalar<vInteger> > coor(FineGrid);
+    Lattice<iScalar<vInteger> > coor (FineGrid);
+    Lattice<iScalar<vInteger> > bcoor(FineGrid);
+    Lattice<iScalar<vInteger> > bcb  (FineGrid); bcb = Zero();

    CoarseVector iProj(Grid()); 
    CoarseVector oProj(Grid()); 
+    CoarseVector SelfProj(Grid()); 
+    CoarseComplexField iZProj(Grid()); 
+    CoarseComplexField oZProj(Grid()); 
+
    CoarseScalar InnerProd(Grid()); 

    // Orthogonalise the subblocks over the basis
@@ -388,69 +804,117 @@ public:
    // Compute the matrix elements of linop between this orthonormal
    // set of vectors.
    int self_stencil=-1;
-    for(int p=0;p<geom.npoint;p++){ 
+    for(int p=0;p<geom.npoint;p++)
+    { 
+      int dir   = geom.directions[p];
+      int disp  = geom.displacements[p];
      A[p]=Zero();
      if( geom.displacements[p]==0){
 	self_stencil=p;
      }
+
+      Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+
+      LatticeCoordinate(coor,dir);
+
+      ///////////////////////////////////////////////////////
+      // Work out even and odd block checkerboarding for fast diagonal term
+      ///////////////////////////////////////////////////////
+      if ( disp==1 ) {
+	bcb   = bcb + div(coor,block);
+      }
+	
+      if ( disp==0 ) {
+	  masks[p]= Zero();
+      } else if ( disp==1 ) {
+	masks[p] = where(mod(coor,block)==(block-1),one,zero);
+      } else if ( disp==-1 ) {
+	masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
+      }
    }
+    evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
+    oddmask  = one-evenmask;
+
    assert(self_stencil!=-1);

    for(int i=0;i<nbasis;i++){
+
      phi=Subspace.subspace[i];
-	
-      std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
+
+      //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
+      linop.OpDirAll(phi,Mphi_p);
+      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);

      for(int p=0;p<geom.npoint;p++){ 

+	Mphi = Mphi_p[p];
+
 	int dir   = geom.directions[p];
 	int disp  = geom.displacements[p];

-	Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+	if ( (disp==-1) || (!hermitian ) ) {

-	LatticeCoordinate(coor,dir);
-
-	if ( disp==0 ){
-	  linop.OpDiag(phi,Mphi);
-	}
-	else  {
-	  linop.OpDir(phi,Mphi,dir,disp); 
-	}
-
-	////////////////////////////////////////////////////////////////////////
-	// Pick out contributions coming from this cell and neighbour cell
-	////////////////////////////////////////////////////////////////////////
-	if ( disp==0 ) {
-	  iblock = Mphi;
-	  oblock = Zero();
-	} else if ( disp==1 ) {
-	  oblock = where(mod(coor,block)==(block-1),Mphi,zz);
-	  iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
-	} else if ( disp==-1 ) {
-	  oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
-	  iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
-	} else {
-	  assert(0);
-	}
-
-	Subspace.ProjectToSubspace(iProj,iblock);
-	Subspace.ProjectToSubspace(oProj,oblock);
-	//	  blockProject(iProj,iblock,Subspace.subspace);
-	//	  blockProject(oProj,oblock,Subspace.subspace);
-	auto iProj_v = iProj.View() ;
-	auto oProj_v = oProj.View() ;
-	auto A_p     =  A[p].View();
-	auto A_self  = A[self_stencil].View();
-	thread_for(ss, Grid()->oSites(),{
+	  ////////////////////////////////////////////////////////////////////////
+	  // Pick out contributions coming from this cell and neighbour cell
+	  ////////////////////////////////////////////////////////////////////////
+	  omask = masks[p];
+	  imask = one-omask;
+	
 	  for(int j=0;j<nbasis;j++){
-	    if( disp!= 0 ) {
-	      A_p[ss](j,i) = oProj_v[ss](j);
-	    }
-	    A_self[ss](j,i) =	A_self[ss](j,i) + iProj_v[ss](j);
+	    
+	    blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
+	    
+	    auto iZProj_v = iZProj.View() ;
+	    auto oZProj_v = oZProj.View() ;
+	    auto A_p     =  A[p].View();
+	    auto A_self  = A[self_stencil].View();
+
+	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
+	    //      if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
+	    //	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });
+
+	  }
+	}
+      }
+
+      ///////////////////////////////////////////
+      // Faster alternate self coupling.. use hermiticity to save 2x
+      ///////////////////////////////////////////
+      {
+	mult(tmp,phi,evenmask);  linop.Op(tmp,Mphie);
+	mult(tmp,phi,oddmask );  linop.Op(tmp,Mphio);
+
+	{
+	  auto tmp_      = tmp.View();
+	  auto evenmask_ = evenmask.View();
+	  auto oddmask_  =  oddmask.View();
+	  auto Mphie_    =  Mphie.View();
+	  auto Mphio_    =  Mphio.View();
+	  accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ 
+	      coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
+	    });
+	}
+
+	blockProject(SelfProj,tmp,Subspace.subspace);
+
+	auto SelfProj_ = SelfProj.View();
+	auto A_self  = A[self_stencil].View();
+
+	accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
+	  for(int j=0;j<nbasis;j++){
+	    coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
 	  }
 	});
+
      }
    }
+    if(hermitian) {
+      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
+      ForceHermitian();
+    }
+      // AssertHermitian();
+      // ForceDiagonal();
+  }

 #if 0
    ///////////////////////////
@@ -473,17 +937,26 @@ public:
    std::cout<<GridLogMessage<< iProj <<std::endl;
    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
-      //      ForceHermitian();
-      // AssertHermitian();
-      // ForceDiagonal();
-  }
+

  void ForceHermitian(void) {
-    for(int d=0;d<4;d++){
-      int dd=d+1;
-      A[2*d] = adj(Cshift(A[2*d+1],dd,1));
+    CoarseMatrix Diff  (Grid());
+    for(int p=0;p<geom.npoint;p++){
+      int dir   = geom.directions[p];
+      int disp  = geom.displacements[p];
+      if(disp==-1) {
+	// Find the opposite link
+	for(int pp=0;pp<geom.npoint;pp++){
+	  int dirp   = geom.directions[pp];
+	  int dispp  = geom.displacements[pp];
+	  if ( (dirp==dir) && (dispp==1) ){
+	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp]; 
+	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
+	    A[pp] = adj(Cshift(A[p],dir,1));
+	  }
+	}
+      }
    }
-    //      A[8] = 0.5*(A[8] + adj(A[8]));
  }
  void AssertHermitian(void) {
    CoarseMatrix AA    (Grid());
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -47,6 +47,7 @@ public:
  // Support for coarsening to a multigrid
  virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
  virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
+  virtual void OpDirAll  (const Field &in, std::vector<Field> &out) = 0; // Abstract base

  virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
@@ -83,6 +84,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -93,8 +97,7 @@ public:
    _Mat.MdagM(in,out,n1,n2);
  }
  void HermOp(const Field &in, Field &out){
-    RealD n1,n2;
-    HermOpAndNorm(in,out,n1,n2);
+    _Mat.MdagM(in,out);
  }
 };

@@ -116,6 +119,9 @@ public:
    _Mat.Mdir(in,out,dir,disp);
    assert(0);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
    assert(0);
@@ -154,6 +160,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -162,7 +171,6 @@ public:
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    _Mat.M(in,out);
-	
    ComplexD dot= innerProduct(in,out); n1=real(dot);
    n2=norm2(out);
  }
@@ -171,6 +179,35 @@ public:
  }
 };

+template<class Matrix,class Field>
+class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+public:
+  NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
+
+  ////////////////////////////////////////////////////////
+  // Stencil pieces used when coarsening to a multigrid;
+  // each call is forwarded to the wrapped matrix.
+  ////////////////////////////////////////////////////////
+  void OpDiag  (const Field &in, Field &out)                  { _Mat.Mdiag(in,out); }
+  void OpDir   (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); }
+  void OpDirAll(const Field &in, std::vector<Field> &out)     { _Mat.MdirAll(in,out); }
+
+  ////////////////////////////////////////////////////////
+  // The operator and its adjoint
+  ////////////////////////////////////////////////////////
+  void Op      (const Field &in, Field &out) { _Mat.M(in,out); }
+  void AdjOp   (const Field &in, Field &out) { _Mat.Mdag(in,out); }
+
+  ////////////////////////////////////////////////////////
+  // No Hermitian form is provided by this wrapper:
+  // both Hermitian entry points stop at assert(0).
+  ////////////////////////////////////////////////////////
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }
+  void HermOp       (const Field &in, Field &out)                     { assert(0); }
+
+};
+
    //////////////////////////////////////////////////////////
    // Even Odd Schur decomp operators; there are several
    // ways to introduce the even odd checkerboarding
@@ -208,6 +245,9 @@ public:
      void OpDir  (const Field &in, Field &out,int dir,int disp) {
 	assert(0);
      }
+      void OpDirAll  (const Field &in, std::vector<Field> &out){
+	assert(0);
+      };
    };
    template<class Matrix,class Field>
    class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
@@ -296,7 +336,7 @@ public:
    };
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
-    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
+    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
    template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -45,8 +45,13 @@ public:
    ni=M(in,tmp);
    no=Mdag(tmp,out);
  }
+  virtual void  MdagM(const Field &in, Field &out) { // overload without norm outputs
+    RealD ni, no; // receive the values written by the four-argument overload; not used afterwards
+    MdagM(in,out,ni,no);
+  }
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
 };

 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -56,12 +61,12 @@ template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrix
 public:
  virtual GridBase *RedBlackGrid(void)=0;

-      //////////////////////////////////////////////////////////////////////
-      // Query the even even properties to make algorithmic decisions
-      //////////////////////////////////////////////////////////////////////
-      virtual RealD  Mass(void)        { return 0.0; };
-      virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
-      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
+  //////////////////////////////////////////////////////////////////////
+  // Query the even even properties to make algorithmic decisions
+  //////////////////////////////////////////////////////////////////////
+  virtual RealD  Mass(void)        { return 0.0; };
+  virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
+  virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better

  // half checkerboard operaions
  virtual  void Meooe    (const Field &in, Field &out)=0;
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -94,6 +94,24 @@ public:
    Coeffs.assign(0.,order);
    Coeffs[order-1] = 1.;
  };
+  
+  // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
+  // Similar kick effect below the threshold as Lanczos filter approach
+  void InitLowPass(RealD _lo,RealD _hi,int _order)
+  {
+    // Record the window and coefficient count; fewer than two coefficients terminates the program
+    lo    = _lo;
+    hi    = _hi;
+    order = _order;
+    if(order < 2) exit(-1);
+    // Coeffs[j] = (2/order) * cos( j*pi*(order-1/2)/order )
+    const RealD kmax = (order-1.0);
+    Coeffs.resize(order);
+    for(int idx=0; idx<order; ++idx){
+      const RealD cosine = std::cos( idx*M_PI*(kmax+0.5)/order );
+      Coeffs[idx] = cosine * 2.0/order;
+    }
+  };

  void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
  {
@@ -234,20 +252,20 @@ public:
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    Linop.HermOp(T0,y);
-    T1=y*xscale+in*mscale;
+    axpby(T1,xscale,mscale,y,in);

    // sum = .5 c[0] T0 + c[1] T1
-    out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
+    //    out = ()*T0 + Coeffs[1]*T1;
+    axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
    for(int n=2;n<order;n++){
 	
      Linop.HermOp(*Tn,y);
-
-      y=xscale*y+mscale*(*Tn);
-
-      *Tnp=2.0*y-(*Tnm);
-
-      out=out+Coeffs[n]* (*Tnp);
-
+      //     y=xscale*y+mscale*(*Tn);
+      //      *Tnp=2.0*y-(*Tnm);
+      //      out=out+Coeffs[n]* (*Tnp);
+      axpby(y,xscale,mscale,y,(*Tn));
+      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
+      axpy(out,Coeffs[n],*Tnp,out);
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
--- a/Grid/algorithms/approx/JacobiPolynomial.h
+++ b/Grid/algorithms/approx/JacobiPolynomial.h
@@ -0,0 +1,129 @@
+#ifndef GRID_JACOBIPOLYNOMIAL_H
+#define GRID_JACOBIPOLYNOMIAL_H
+
+#include <Grid/algorithms/LinearOperator.h>
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Field>
+class JacobiPolynomial : public OperatorFunction<Field> {
+ private:
+  using OperatorFunction<Field>::operator();
+  // Polynomial degree, window [lo,hi] that is mapped onto [-1,1], and the Jacobi parameters
+  int order;
+  RealD hi;
+  RealD lo;
+  RealD alpha;
+  RealD beta;
+
+ public:
+  void csv(std::ostream &out){
+    csv(out,lo,hi);
+  }
+  void csv(std::ostream &out,RealD llo,RealD hhi){
+    RealD diff = hhi-llo;
+    if(diff<=0.0) return; // empty range: a zero step would never reach the loop bound
+    RealD delta = diff*1.0e-5;
+    for (RealD x=llo-delta; x<=hhi; x+=delta) {
+      RealD f = approx(x);
+      out<< x<<" "<<f <<std::endl;
+    }
+  }
+
+  JacobiPolynomial(){};
+  JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta)
+  {
+      lo=_lo;
+      hi=_hi;
+      alpha=_alpha;
+      beta=_beta;
+      order=_order;
+  };
+
+  RealD approx(RealD x) // Convenience for plotting the approximation
+  {
+    // Scalar three-term recurrence; Tnm, Tn, Tnp hold P_{n-1}, P_n, P_{n+1}
+    RealD Tn;
+    RealD Tnm;
+    RealD Tnp;
+    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo)); // map [lo,hi] onto [-1,1]
+
+    RealD T0=1.0;
+    RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
+    if(order<1) return T0; // P_0 = 1
+    Tn =T1;
+    Tnm=T0;
+    for(int n=2;n<=order;n++){
+      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
+      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
+      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
+      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
+      Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
+      Tnm=Tn;
+      Tn =Tnp;
+    }
+    return Tn; // Tn holds P_order, and is also defined when the loop body never runs (order==1)
+  };
+
+  // Apply the degree-`order` Jacobi polynomial of the rescaled operator to `in`, writing `out`
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+    // Field version of approx(): x is replaced by Linop.HermOp and 1 by the input vector
+    GridBase *grid=in.Grid();
+
+    Field T0(grid);
+    Field T1(grid);
+    Field T2(grid);
+    Field y(grid);
+
+    // Three rotating work vectors: P_{n-1}, P_n and storage for P_{n+1}
+    Field *Tnm = &T0;
+    Field *Tn  = &T1;
+    Field *Tnp = &T2;
+
+    //    RealD T0=1.0;
+    T0=in;
+    if(order<1) { out=T0; return; } // P_0 = 1, so the result is the input itself
+
+    //    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+    //           = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
+    RealD xscale = 2.0/(hi-lo);
+    RealD mscale = -(hi+lo)/(hi-lo);
+    Linop.HermOp(T0,y); // one operator application is enough to form y
+    y=y*xscale+in*mscale;
+
+    // RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
+    RealD halfAmB  = (alpha-beta)*0.5;
+    RealD halfApBp2= (alpha+beta+2.0)*0.5;
+    T1 = halfAmB * in + halfApBp2*y;
+
+    for(int n=2;n<=order;n++){
+
+      Linop.HermOp(*Tn,y);
+      y=xscale*y+mscale*(*Tn);
+
+      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
+      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
+      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
+      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
+
+      //      Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
+      // Each coefficient is normalised by cnp exactly once, as in approx()
+      cny=cny/cnp;
+      cn1=cn1/cnp;
+      cnm=cnm/cnp;
+
+      *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
+
+      // Cycle pointers to avoid copies
+      Field *swizzle = Tnm;
+      Tnm    =Tn;
+      Tn     =Tnp;
+      Tnp    =swizzle;
+    }
+    // After the pointer cycle the newest polynomial is *Tn; *Tnp is the recycled oldest buffer
+    out=*Tn;
+
+  }
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -71,7 +71,6 @@ public:
    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
-
    
    Linop.HermOpAndNorm(psi, mmp, d, b);
    
@@ -154,18 +153,18 @@ public:
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;

-        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
-        std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
-	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
-	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k 
+		  << "\tComputed residual " << std::sqrt(cp / ssq)
+		  << "\tTrue residual " << true_residual
+		  << "\tTarget " << Tolerance << std::endl;

-        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+        std::cout << GridLogIterative << "Time breakdown "<<std::endl;
+	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -43,6 +43,11 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
 {
+  // If assume basis[j] are already orthonormal,
+  // can take all inner products in parallel saving 2x bandwidth
+  // Save 3x bandwidth on the second line of loop.
+  // perhaps 2.5x speed up.
+  // 2x overall in Multigrid Lanczos  
  for(int j=0; j<k; ++j){
    auto ip = innerProduct(basis[j],w);
    w = w - ip*basis[j];
@@ -54,17 +59,19 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
 {
  typedef decltype(basis[0].View()) View;
  auto tmp_v = basis[0].View();
-  std::vector<View> basis_v(basis.size(),tmp_v);
+  Vector<View> basis_v(basis.size(),tmp_v);
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();
-      
+
  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].View();
  }
-
+#if 0
+  std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
  thread_region
  {
-    std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
+    vobj* B = Bt.data() + Nm * thread_num();
+
    thread_for_in_region(ss, grid->oSites(),{
      for(int j=j0; j<j1; ++j) B[j]=0.;
      
@@ -78,24 +85,89 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
      }
    });
  }
+#else
+
+  int nrot = j1-j0;
+
+
+  uint64_t oSites   =grid->oSites();
+  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
+
+  //  printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
+
+  Vector <vobj> Bt(siteBlock * nrot); 
+  auto Bp=&Bt[0];
+
+  // GPU readable copy of Eigen matrix
+  Vector<double> Qt_jv(Nm*Nm);
+  double *Qt_p = & Qt_jv[0];
+  for(int k=0;k<Nm;++k){
+    for(int j=0;j<Nm;++j){
+      Qt_p[j*Nm+k]=Qt(j,k);
+    }
+  }
+
+  // Block the loop to keep storage footprint down
+  vobj zz=Zero();
+  for(uint64_t s=0;s<oSites;s+=siteBlock){
+
+    // remaining work in this block
+    int ssites=MIN(siteBlock,oSites-s);
+
+    // zero out the accumulators
+    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
+	auto z=coalescedRead(zz);
+	coalescedWrite(Bp[ss],z);
+    });
+
+    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
+	
+      int j =sj%nrot;
+      int jj  =j0+j;
+      int ss =sj/nrot;
+      int sss=ss+s;
+
+      for(int k=k0; k<k1; ++k){
+	auto tmp = coalescedRead(Bp[ss*nrot+j]);
+	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
+      }
+    });
+
+    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
+      int j =sj%nrot;
+      int jj  =j0+j;
+      int ss =sj/nrot;
+      int sss=ss+s;
+      coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
+    });
+  }
+#endif
 }

 // Extract a single rotated vector
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 {
+  typedef decltype(basis[0].View()) View;
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();

  result.Checkerboard() = basis[0].Checkerboard();
  auto result_v=result.View();
-  thread_for(ss, grid->oSites(),{
-    vobj B = Zero();
+  Vector<View> basis_v(basis.size(),result_v); // one view per basis vector; result_v is only the initial fill value
+  for(int k=0;k<basis.size();k++){
+    basis_v[k] = basis[k].View();
+  }
+  vobj zz=Zero(); // zero object read inside the loop to start each accumulator
+  Vector<double> Qt_jv(Nm); // row j of Qt copied into a plain array of doubles
+  double * Qt_j = & Qt_jv[0];
+  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
+  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
+    auto B=coalescedRead(zz);
    for(int k=k0; k<k1; ++k){
-      auto basis_k = basis[k].View();
-      B +=Qt(j,k) * basis_k[ss];
+      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]); // accumulate Qt(j,k) * basis[k] at site ss
    }
-    result_v[ss] = B;
+    coalescedWrite(result_v[ss], B); // store the sum over k in [k0,k1) for this site
  });
 }

@@ -279,7 +351,7 @@ public:
 			    RealD _eresid, // resid in lmdue deficit 
 			    int _MaxIter, // Max iterations
 			    RealD _betastp=0.0, // if beta(k) < betastp: converged
-			    int _MinRestart=1, int _orth_period = 1,
+			    int _MinRestart=0, int _orth_period = 1,
 			    IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    SimpleTester(HermOp), _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(Tester),
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
@@ -295,7 +367,7 @@ public:
 			       RealD _eresid, // resid in lmdue deficit 
 			       int _MaxIter, // Max iterations
 			       RealD _betastp=0.0, // if beta(k) < betastp: converged
-			       int _MinRestart=1, int _orth_period = 1,
+			       int _MinRestart=0, int _orth_period = 1,
 			       IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    SimpleTester(HermOp),  _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(SimpleTester),
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
@@ -344,7 +416,7 @@ until convergence
    GridBase *grid = src.Grid();
    assert(grid == evec[0].Grid());
    
-    GridLogIRL.TimingMode(1);
+    //    GridLogIRL.TimingMode(1);
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -369,14 +441,17 @@ until convergence
    {
      auto src_n = src;
      auto tmp = src;
+      std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl;
      const int _MAX_ITER_IRL_MEVAPP_ = 50;
      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
 	normalise(src_n);
 	_HermOp(src_n,tmp);
+	//	std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
+	//	std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
 	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
 	RealD vden = norm2(src_n);
 	RealD na = vnum/vden;
-	if (fabs(evalMaxApprox/na - 1.0) < 0.05)
+	if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
 	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@@ -574,11 +649,11 @@ until convergence
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
 2. For k = 1,2,...,m Do:
-3. wk:=Avk−βkv_{k−1}      
-4. αk:=(wk,vk)       // 
-5. wk:=wk−αkvk       // wk orthog vk 
-6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-7. vk+1 := wk/βk+1
+3. wk:=Avk - b_k v_{k-1}      
+4. ak:=(wk,vk)       // 
+5. wk:=wk-akvk       // wk orthog vk 
+6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop
+7. vk+1 := wk/b_k+1
 8. EndDo
 */
  void step(std::vector<RealD>& lmd,
@@ -586,6 +661,7 @@ until convergence
 	    std::vector<Field>& evec,
 	    Field& w,int Nm,int k)
  {
+    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
    const RealD tiny = 1.0e-20;
    assert( k< Nm );

@@ -597,20 +673,20 @@ until convergence

    if(k>0) w -= lme[k-1] * evec[k-1];

-    ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
+    ComplexD zalph = innerProduct(evec_k,w);
    RealD     alph = real(zalph);

-    w = w - alph * evec_k;// 5. wk:=wk−αkvk
+    w = w - alph * evec_k;

-    RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-    // 7. vk+1 := wk/βk+1
+    RealD beta = normalise(w); 

    lmd[k] = alph;
    lme[k] = beta;

-    if (k>0 && k % orth_period == 0) {
+    if ( (k>0) && ( (k % orth_period) == 0 )) {
+      std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
      orthogonalize(w,evec,k); // orthonormalise
-      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
    }

    if(k < Nm-1) evec[k+1] = w;
@@ -618,6 +694,8 @@ until convergence
    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
+
+    std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
  }

  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
--- a/Grid/algorithms/iterative/NormalEquations.h
+++ b/Grid/algorithms/iterative/NormalEquations.h
@@ -33,26 +33,78 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Take a matrix and form an NE solver calling a Herm solver
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class NormalEquations : public OperatorFunction<Field>{
+template<class Field> class NormalEquations {
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
-
+  LinearFunction<Field>   & _Guess;
 public:

  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  /////////////////////////////////////////////////////
-  NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver) 
-    :  _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; 
+ NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
+		 LinearFunction<Field> &Guess) 
+   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 

  void operator() (const Field &in, Field &out){
 
    Field src(in.Grid());
+    // src receives Mdag in below; no further work vector is needed
 
+    MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
    _Matrix.Mdag(in,src);
-    _HermitianSolver(src,out);  // Mdag M out = Mdag in
+    _Guess(src,out); // let the guesser fill out before the solve
+    _HermitianSolver(MdagMOp,src,out);  // Mdag M out = Mdag in
+
+  }     
+};
+
+template<class Field> class HPDSolver {
+private:
+  LinearOperatorBase<Field> & _Matrix;
+  OperatorFunction<Field> & _HermitianSolver;
+  LinearFunction<Field>   & _Guess;
+public:
+
+  /////////////////////////////////////////////////////
+  // Hand the operator straight to the Hermitian solver
+  /////////////////////////////////////////////////////
+ HPDSolver(LinearOperatorBase<Field> &Matrix,
+	   OperatorFunction<Field> &HermitianSolver,
+	   LinearFunction<Field> &Guess) 
+   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
+
+  void operator() (const Field &in, Field &out){
 
+    _Guess(in,out); // guesser writes out first; presumably the solver's starting point - confirm
+    _HermitianSolver(_Matrix,in,out);  // Matrix out = in (no Mdag applied here)
+
+  }     
+};
+
+
+template<class Field> class MdagMSolver {
+private:
+  SparseMatrixBase<Field> & _Matrix;
+  OperatorFunction<Field> & _HermitianSolver;
+  LinearFunction<Field>   & _Guess;
+public:
+
+  /////////////////////////////////////////////////////
+  // Solve with Mdag M of the matrix; source is used as given
+  /////////////////////////////////////////////////////
+ MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
+	     LinearFunction<Field> &Guess) 
+   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
+
+  void operator() (const Field &in, Field &out){
+ 
+    MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
+    _Guess(in,out); // guesser writes out first; presumably the solver's starting point - confirm
+
+    _HermitianSolver(MdagMOp,in,out);  // Mdag M out = in (Mdag is not applied to the source)
+
  }     
 };

--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@@ -30,12 +30,12 @@ template<class Field> class PowerMethod
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      
-      if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { 
+      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
+	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
 	return evalMaxApprox; 
      } 
      evalMaxApprox = na; 
-      std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
      src_n = tmp;
    }
    assert(0);
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -38,10 +38,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);

+#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" " 
+
 template<class Field>
-class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
+class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-  using OperatorFunction<Field>::operator();

  RealD   Tolerance;
  Integer MaxIterations;
@@ -49,23 +50,29 @@ public:
  int mmax;
  int nstep;
  int steps;
+  int level;
  GridStopWatch PrecTimer;
  GridStopWatch MatTimer;
  GridStopWatch LinalgTimer;

-  LinearFunction<Field> &Preconditioner;
+  LinearFunction<Field>     &Preconditioner;
+  LinearOperatorBase<Field> &Linop;

-  PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
+  void Level(int lv) { level=lv; };
+
+  PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
    Tolerance(tol), 
    MaxIterations(maxit),
+    Linop(_Linop),
    Preconditioner(Prec),
    mmax(_mmax),
    nstep(_nstep)
  { 
+    level=1;
    verbose=1;
  };

-  void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+  void operator() (const Field &src, Field &psi){

    psi=Zero();
    RealD cp, ssq,rsq;
@@ -84,9 +91,9 @@ public:
    steps=0;
    for(int k=0;k<MaxIterations;k++){

-      cp=GCRnStep(Linop,src,psi,rsq);
+      cp=GCRnStep(src,psi,rsq);

-      std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+      GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;

      if(cp<rsq) {

@@ -95,24 +102,26 @@ public:
 	Linop.HermOp(psi,r);
 	axpy(r,-1.0,src,r);
 	RealD tr = norm2(r);
-	std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
+	GCRLogLevel<<"PGCR: Converged on iteration " <<steps
 		 << " computed residual "<<sqrt(cp/ssq)
 		 << " true residual "    <<sqrt(tr/ssq)
 		 << " target "           <<Tolerance <<std::endl;

-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
+	GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
+	/*
+	  GCRLogLevel<<"PGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
+	  GCRLogLevel<<"PGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
+	  GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
+	*/
 	return;
      }

    }
-    std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
-    assert(0);
+    GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
+    //    assert(0);
  }

-  RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
+  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){

    RealD cp;
    RealD a, b;
@@ -134,6 +143,7 @@ public:
    std::vector<Field> p(mmax,grid);
    std::vector<RealD> qq(mmax);
      
+    GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;

    //////////////////////////////////
    // initial guess x0 is taken as nonzero.
@@ -143,38 +153,26 @@ public:
    Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
    MatTimer.Stop();
    
+
    LinalgTimer.Start();
    r=src-Az;
    LinalgTimer.Stop();
+    GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl;
    
    /////////////////////
    // p = Prec(r)
    /////////////////////
+
    PrecTimer.Start();
    Preconditioner(r,z);
    PrecTimer.Stop();

-    MatTimer.Start();
-    Linop.HermOp(z,tmp); 
-    MatTimer.Stop();
-
-    LinalgTimer.Start();
-    ttmp=tmp;
-    tmp=tmp-r;
-    LinalgTimer.Stop();
-
-    /*
-      std::cout<<GridLogMessage<<r<<std::endl;
-      std::cout<<GridLogMessage<<z<<std::endl;
-      std::cout<<GridLogMessage<<ttmp<<std::endl;
-      std::cout<<GridLogMessage<<tmp<<std::endl;
-    */
-
    MatTimer.Start();
    Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
    MatTimer.Stop();

    LinalgTimer.Start();
+
    //p[0],q[0],qq[0] 
    p[0]= z;
    q[0]= Az;
@@ -200,11 +198,12 @@ public:
      cp = axpy_norm(r,-a,q[peri_k],r);
      LinalgTimer.Stop();

+      GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
+
      if((k==nstep-1)||(cp<rsq)){
 	return cp;
      }

-      std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl; 

      PrecTimer.Start();
      Preconditioner(r,z);// solve Az = r
@@ -212,12 +211,9 @@ public:

      MatTimer.Start();
      Linop.HermOpAndNorm(z,Az,zAz,zAAz);
-      Linop.HermOp(z,tmp);
      MatTimer.Stop();

      LinalgTimer.Start();
-      tmp=tmp-r;
-      std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 

      q[peri_kp]=Az;
      p[peri_kp]=z;
--- a/Grid/algorithms/iterative/QuasiMinimalResidual.h
+++ b/Grid/algorithms/iterative/QuasiMinimalResidual.h
@@ -0,0 +1,371 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/QuasiMinimalResidual.h
+
+Copyright (C) 2019
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Returns the real part of innerProduct(l, G5R5 r); the full complex value is printed first.
+template<class Field> 
+RealD innerG5ProductReal(Field &l, Field &r)
+{
+  Field g5r5_r(l.Grid());        // scratch field on l's grid
+  G5R5(g5r5_r,r);                // used in place of the plain product tmp = G5*r
+  ComplexD ip =innerProduct(l,g5r5_r);
+  // Printed on every call (debug trace of the complex inner product).
+  std::cout << "innerProductRealG5R5 "<<ip<<std::endl;
+  return ip.real();
+}
+
+template<class Field>
+class QuasiMinimalResidual : public OperatorFunction<Field> {
+ public:
+  using OperatorFunction<Field>::operator();
+
+  bool ErrorOnNoConverge; 
+  RealD   Tolerance;
+  Integer MaxIterations;
+  Integer IterationCount;
+
+  // Stores the solver parameters; initialisers follow the member declaration order.
+  QuasiMinimalResidual(RealD tol, Integer maxit, bool err_on_no_conv = true)
+      : ErrorOnNoConverge(err_on_no_conv)
+      , Tolerance(tol)
+      , MaxIterations(maxit)
+      , IterationCount(0)
+  {}
+
+#if 1
+  // QMR solve of LinOp x = b from the initial guess in x (applies Op and AdjOp each iteration).
+  // Converged when norm2(r)/norm2(b) <= Tolerance; IterationCount holds the last iteration run.
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x) 
+  {
+    RealD resid;
+    IterationCount=0;
+    RealD  rho, rho_1, xi, gamma, gamma_1, theta, theta_1;
+    RealD  eta, delta, ep, beta; 
+
+    GridBase *Grid = b.Grid();
+    Field r(Grid), d(Grid), s(Grid);
+    Field v(Grid), w(Grid), y(Grid),  z(Grid);
+    Field v_tld(Grid), w_tld(Grid), y_tld(Grid), z_tld(Grid);
+    Field p(Grid), q(Grid), p_tld(Grid);
+
+    RealD normb = norm2(b);
+    LinOp.Op(x,r); r = b - r;
+
+    assert(normb> 0.0);
+    // NOTE(review): norm2() values are used without sqrt here and for rho/xi below - confirm intended.
+    resid = norm2(r)/normb;
+    if (resid <= Tolerance) {
+      return;
+    }
+
+    v_tld = r;
+    y = v_tld;
+    rho = norm2(y);
+
+    // Take Gamma5 conjugate
+    //    Gamma G5(Gamma::Algebra::Gamma5);
+    //    G5R5(w_tld,r);
+    //    w_tld = G5* v_tld;
+    w_tld=v_tld;
+    z = w_tld;
+    xi = norm2(z);
+
+    gamma = 1.0;
+    eta   = -1.0;
+    theta = 0.0;
+
+    for (int i = 1; i <= MaxIterations; i++) {
+      IterationCount = i;                // record progress for callers
+      // Breakdown tests
+      assert( rho != 0.0);
+      assert( xi  != 0.0);
+
+      v = (1. / rho) * v_tld;
+      y = (1. / rho) * y;
+
+      w = (1. / xi) * w_tld;
+      z = (1. / xi) * z;
+
+      ComplexD Zdelta = innerProduct(z, y); // Complex?
+      std::cout << "Zdelta "<<Zdelta<<std::endl;
+      delta = Zdelta.real();
+
+      y_tld = y; 
+      z_tld = z;
+
+      if (i > 1) {
+	p = y_tld - (xi  * delta / ep) * p;
+	q = z_tld - (rho * delta / ep) * q;
+      } else {
+	p = y_tld;
+	q = z_tld;
+      }
+
+      LinOp.Op(p,p_tld);      //     p_tld = A * p;
+      ComplexD Zep = innerProduct(q, p_tld);
+      ep=Zep.real();
+      std::cout << "Zep "<<Zep <<std::endl;
+      // Complex Audit
+      assert(abs(ep)>0);
+
+      beta = ep / delta;
+      assert(abs(beta)>0);
+
+      v_tld = p_tld - beta * v;
+      y = v_tld;
+
+      rho_1 = rho;
+      rho   = norm2(y);
+      LinOp.AdjOp(q,w_tld);
+      w_tld = w_tld - beta * w;
+      z = w_tld;
+
+      xi = norm2(z);
+
+      gamma_1 = gamma;
+      theta_1 = theta;
+
+      theta   = rho / (gamma_1 * beta);
+      gamma   = 1.0 / sqrt(1.0 + theta * theta);
+      std::cout << "theta "<<theta<<std::endl;
+      std::cout << "gamma "<<gamma<<std::endl;
+
+      assert(abs(gamma)> 0.0);
+
+      eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
+
+      if (i > 1) {
+	d = eta * p + (theta_1 * theta_1 * gamma * gamma) * d;
+	s = eta * p_tld + (theta_1 * theta_1 * gamma * gamma) * s;
+      } else {
+	d = eta * p;
+	s = eta * p_tld;
+      }
+
+      x =x+d;                            // update approximation vector
+      r =r-s;                            // compute residual
+
+      if ((resid = norm2(r) / normb) <= Tolerance) {
+	return;
+      }
+      std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
+    }
+    if ( ErrorOnNoConverge ) assert(0);  // MaxIterations exhausted without meeting Tolerance
+    return;                            // no convergence
+  }
+#else
+  // QMRg5 SMP thesis -- alternative operator(), compiled out by the "#if 1" above
+  void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x) 
+  {
+    // Real scalars
+    GridBase *grid = b.Grid();
+
+    Field    r(grid);
+    Field    p_m(grid), p_m_minus_1(grid), p_m_minus_2(grid);
+    Field    v_m(grid), v_m_minus_1(grid), v_m_plus_1(grid);
+    Field    tmp(grid);
+
+    RealD    w;
+    RealD    z1, z2;
+    RealD    delta_m, delta_m_minus_1;
+    RealD    c_m_plus_1, c_m, c_m_minus_1;
+    RealD    s_m_plus_1, s_m, s_m_minus_1;
+    RealD    alpha, beta, gamma, epsilon;
+    RealD    mu, nu, rho, theta, xi, chi;
+    RealD    mod2r, mod2b;
+    RealD    tau2, target2;
+
+    mod2b=norm2(b);
+
+    /////////////////////////
+    // Initial residual
+    /////////////////////////
+    LinOp.Op(x,tmp);
+    r = b - tmp;
+
+    /////////////////////////
+    // \mu = \rho = |r_0|
+    /////////////////////////
+    mod2r = norm2(r);
+    rho = sqrt( mod2r);
+    mu=rho;
+    
+    std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
+    /////////////////////////
+    // Zero negative history
+    /////////////////////////
+    v_m_plus_1  = Zero();
+    v_m_minus_1 = Zero();
+    p_m_minus_1 = Zero();
+    p_m_minus_2 = Zero();
+
+    // v0
+    v_m = (1.0/rho)*r;
+
+    /////////////////////////
+    // Initial coeffs
+    /////////////////////////
+    delta_m_minus_1 = 1.0;
+    c_m_minus_1     = 1.0;
+    c_m             = 1.0;
+    s_m_minus_1     = 0.0;
+    s_m             = 0.0;
+
+    /////////////////////////
+    // Set up convergence check
+    /////////////////////////
+    tau2    = mod2r;
+    target2 = mod2b * Tolerance*Tolerance;
+ 
+    for(int iter = 0 ; iter < MaxIterations; iter++){
+
+      /////////////////////////
+      // \delta_m = (v_m, \gamma_5 v_m) 
+      /////////////////////////
+      delta_m = innerG5ProductReal(v_m,v_m);
+      std::cout << "QuasiMinimalResidual delta_m "<< delta_m<<std::endl;
+
+      /////////////////////////
+      // tmp = A v_m
+      /////////////////////////
+      LinOp.Op(v_m,tmp);
+
+      /////////////////////////
+      // \alpha = (v_m, \gamma_5 temp) / \delta_m 
+      /////////////////////////
+      alpha = innerG5ProductReal(v_m,tmp);
+      alpha = alpha/delta_m ;
+      std::cout << "QuasiMinimalResidual alpha "<< alpha<<std::endl;
+
+      /////////////////////////
+      // \beta = \rho \delta_m / \delta_{m-1}
+      /////////////////////////
+      beta = rho * delta_m / delta_m_minus_1;
+      std::cout << "QuasiMinimalResidual beta "<< beta<<std::endl;
+
+      /////////////////////////
+      // \tilde{v}_{m+1} = temp - \alpha v_m - \beta v_{m-1}
+      /////////////////////////
+      v_m_plus_1 = tmp - alpha*v_m - beta*v_m_minus_1;
+
+      ///////////////////////////////
+      // \rho = || \tilde{v}_{m+1} ||
+      ///////////////////////////////
+      rho = sqrt( norm2(v_m_plus_1) );
+      std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
+
+      ///////////////////////////////
+      //      v_{m+1} = \tilde{v}_{m+1}
+      ///////////////////////////////
+      v_m_plus_1 = (1.0 / rho) * v_m_plus_1;
+
+      ////////////////////////////////
+      // QMR recurrence coefficients.
+      ////////////////////////////////
+      theta      = s_m_minus_1 * beta;
+      gamma      = c_m_minus_1 * beta;
+      epsilon    =  c_m * gamma + s_m * alpha;
+      xi         = -s_m * gamma + c_m * alpha;
+      nu         = sqrt( xi*xi + rho*rho );
+      c_m_plus_1 = fabs(xi) / nu;
+      if ( xi == 0.0 ) {
+	s_m_plus_1 = 1.0;
+      } else {
+	s_m_plus_1 = c_m_plus_1 * rho / xi;
+      }
+      chi = c_m_plus_1 * xi + s_m_plus_1 * rho;
+
+      std::cout << "QuasiMinimalResidual coeffs "<< theta <<" "<<gamma<<" "<< epsilon<<" "<< xi<<" "<< nu<<std::endl;
+      std::cout << "QuasiMinimalResidual coeffs "<< chi   <<std::endl;
+
+      ////////////////////////////////
+      //p_m=(v_m - \epsilon p_{m-1} - \theta p_{m-2}) / \chi
+      ////////////////////////////////
+      p_m = (1.0/chi) * v_m - (epsilon/chi) * p_m_minus_1 - (theta/chi) * p_m_minus_2;
+
+      ////////////////////////////////////////////////////////////////
+      //      \psi = \psi + c_{m+1} \mu p_m	
+      ////////////////////////////////////////////////////////////////
+      x = x + ( c_m_plus_1 * mu ) * p_m;
+
+      ////////////////////////////////////////
+      // Update mu and shift the delta / c / s history down by one iteration
+      ////////////////////////////////////////
+      mu              = -s_m_plus_1 * mu;
+      delta_m_minus_1 = delta_m;
+      c_m_minus_1     = c_m;
+      c_m             = c_m_plus_1;
+      s_m_minus_1     = s_m;
+      s_m             = s_m_plus_1;
+
+      ////////////////////////////////////
+      // Could use pointer swizzle games.
+      ////////////////////////////////////
+      v_m_minus_1 = v_m;
+      v_m         = v_m_plus_1;
+      p_m_minus_2 = p_m_minus_1;
+      p_m_minus_1 = p_m;
+
+
+      /////////////////////////////////////
+      // Convergence checks
+      /////////////////////////////////////
+      z1 = RealD(iter+1.0);
+      z2 = z1 + 1.0;
+      tau2 = tau2 *( z2 / z1 ) * s_m * s_m;
+      std::cout << " QuasiMinimumResidual iteration "<< iter<<std::endl;
+      std::cout << " QuasiMinimumResidual tau bound "<< tau2<<std::endl;
+      // The "1 ||" below makes the test always true: the true residual is recomputed every iteration.
+      // Compute true residual
+      mod2r = tau2;
+      if ( 1 || (tau2 < (100.0 * target2)) ) {
+	LinOp.Op(x,tmp);
+	r = b - tmp;
+	mod2r = norm2(r);
+	std::cout << " QuasiMinimumResidual true residual is "<< mod2r<<std::endl;
+      }
+
+
+      if ( mod2r < target2 ) { 
+
+	std::cout << " QuasiMinimumResidual has converged"<<std::endl;
+	return;
+
+      }
+
+    }
+    // MaxIterations reached: this variant returns without asserting or consulting ErrorOnNoConverge.
+
+  }
+#endif
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -6,6 +6,12 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;

+#ifdef GRID_NVCC  /* SMALL_LIMIT: requests of fewer bytes skip the cache in PointerCache::Insert/Lookup */
+#define SMALL_LIMIT (0)
+#else
+#define SMALL_LIMIT (4096)
+#endif
+
 #ifdef POINTER_CACHE
 int PointerCache::victim;

@@ -13,7 +19,7 @@ PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];

 void *PointerCache::Insert(void *ptr,size_t bytes) {

-  if (bytes < 4096 ) return ptr;
+  if (bytes < SMALL_LIMIT ) return ptr;

 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
@@ -50,7 +56,7 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {

 void *PointerCache::Lookup(size_t bytes) {

-  if (bytes < 4096 ) return NULL;
+  if (bytes < SMALL_LIMIT ) return NULL;

 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -49,8 +49,13 @@ NAMESPACE_BEGIN(Grid);
 #ifdef POINTER_CACHE
 class PointerCache {
 private:
-
+/*Pinning pages is costly*/
+/*Could maintain separate large and small allocation caches*/
+#ifdef GRID_NVCC 
+  static const int Ncache=128;
+#else
  static const int Ncache=8;
+#endif
  static int victim;

  typedef struct { 
@@ -63,7 +68,6 @@ private:

 public:

-
  static void *Insert(void *ptr,size_t bytes) ;
  static void *Lookup(size_t bytes) ;

@@ -170,13 +174,14 @@ public:
    // Unified (managed) memory
    ////////////////////////////////////
    if ( ptr == (_Tp *) NULL ) {
+      //      printf(" alignedAllocater cache miss %ld bytes ",bytes);      BACKTRACEFP(stdout);
      auto err = cudaMallocManaged((void **)&ptr,bytes);
      if( err != cudaSuccess ) {
 	ptr = (_Tp *) NULL;
 	std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
 	assert(0);
      }
-    }
+    } 
    assert( ptr != (_Tp *)NULL);
 #else 
    //////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -47,20 +47,19 @@ public:
  // Give Lattice access
  template<class object> friend class Lattice;

-  GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {}; 
+  GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;}; 

  GridBase(const Coordinate & processor_grid,
 	   const CartesianCommunicator &parent,
 	   int &split_rank) 
-    : CartesianCommunicator(processor_grid,parent,split_rank) {};
+    : CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;};

  GridBase(const Coordinate & processor_grid,
 	   const CartesianCommunicator &parent) 
-    : CartesianCommunicator(processor_grid,parent,dummy) {};
+    : CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;};

  virtual ~GridBase() = default;

-
  // Physics Grid information.
  Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
  Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
@@ -80,7 +79,8 @@ public:
  Coordinate _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
  Coordinate _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1

-    bool _isCheckerBoarded; 
+  bool _isCheckerBoarded; 
+  int        LocallyPeriodic;

 public:

--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -41,9 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <sys/shm.h>
 #include <sys/mman.h>
 #include <zlib.h>
-#ifdef HAVE_NUMAIF_H
-#include <numaif.h>
-#endif

 NAMESPACE_BEGIN(Grid);

@@ -99,6 +96,7 @@ public:
  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
  ///////////////////////////////////////////////////
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -155,6 +155,35 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
+// Nonzero when a divides b exactly (a must be nonzero).
+static inline int divides(int a,int b) {
+  return (b % a) == 0;
+}
+void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
+{
+  ////////////////////////////////////////////////////////////////
+  // Powers of 2,3,5 only in prime decomposition for now
+  ////////////////////////////////////////////////////////////////
+  int ndimension = WorldDims.size();
+  ShmDims=Coordinate(ndimension,1);
+  std::vector<int> primes({2,3,5});
+  int dim = 0;                  // dimensions are visited round robin
+  int AutoShmSize = 1;          // product of the factors placed in ShmDims so far
+  for(int it=0, lastHit=-1; AutoShmSize != WorldShmSize; it++) {
+    assert( (it-lastHit) <= ndimension ); // a whole sweep placed no factor: the loop could never end
+    for(int p=0;p<primes.size();p++) {
+      int prime=primes[p];
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
+        && divides(prime,WorldShmSize/AutoShmSize)  ) {
+	AutoShmSize*=prime;
+	ShmDims[dim]*=prime;
+	lastHit=it;             // remember the iteration that made progress
+	break;
+      }
+    }
+    dim=(dim+1) %ndimension;
+  }
+}
 void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
@@ -221,17 +250,13 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
-  std::vector<int> processor_coor(ndimension);
-  std::vector<int> WorldDims = processors.toVector();
-  std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
-  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
-  std::vector<int> HyperCoor(ndimension);
-  int dim = 0;
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%ndimension;
-    }
+  Coordinate processor_coor(ndimension);
+  Coordinate WorldDims = processors;
+  Coordinate ShmDims  (ndimension);  Coordinate NodeDims (ndimension);
+  Coordinate ShmCoor  (ndimension);    Coordinate NodeCoor (ndimension);    Coordinate WorldCoor(ndimension);
+  Coordinate HyperCoor(ndimension);
+
+  GetShmDims(WorldDims,ShmDims);

  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
@@ -281,27 +306,16 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
-  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
-  ////////////////////////////////////////////////////////////////
-  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
-
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
  Coordinate processor_coor(ndimension);
-  Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1);  Coordinate NodeDims (ndimension);
+  Coordinate WorldDims = processors; Coordinate ShmDims(ndimension);  Coordinate NodeDims (ndimension);
  Coordinate ShmCoor(ndimension);    Coordinate NodeCoor(ndimension);   Coordinate WorldCoor(ndimension);
-  int dim = 0;
-  for(int l2=0;l2<log2size;l2++){
-    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
-    ShmDims[dim]*=2;
-    dim=(dim+1)%ndimension;
-  }

+  GetShmDims(WorldDims,ShmDims);
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
@@ -418,7 +432,14 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  // e.g. DGX1, supermicro board, 
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
-  cudaSetDevice(WorldShmRank);
+
+#ifdef GRID_IBM_SUMMIT
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+    std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
+#else
+    std::cout << "setting device to WorldShmRank"<<std::endl;
+    cudaSetDevice(WorldShmRank);
+#endif
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -445,7 +466,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
    cudaIpcMemHandle_t handle;
-
+    
    if ( r==WorldShmRank ) { 
      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
@@ -714,6 +735,24 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 

+#ifdef GRID_IBM_SUMMIT
+  // Hide the shared memory path between sockets 
+  // if ShmSize is even (the shared-memory ranks are then split into two equal socket halves)
+  if ( (ShmSize & 0x1)==0 ) {
+    int SocketSize = ShmSize/2;
+    int mySocket = ShmRank/SocketSize; 
+    for(int r=0;r<size;r++){
+      int hisRank=ShmRanks[r];
+      if ( hisRank!= MPI_UNDEFINED ) {
+	int hisSocket=hisRank/SocketSize;
+	if ( hisSocket != mySocket ) {
+	  ShmRanks[r] = MPI_UNDEFINED;
+	}
+      }
+    }
+  }
+#endif
+
  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -173,13 +173,14 @@ public:
  ///////////////////////////////////////////////////
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_object scalar_object;
  typedef vobj vector_object;

 private:
  void dealloc(void)
  {
-    alignedAllocator<vobj> alloc;
    if( this->_odata_size ) {
+      alignedAllocator<vobj> alloc;
      alloc.deallocate(this->_odata,this->_odata_size);
      this->_odata=nullptr;
      this->_odata_size=0;
@@ -187,15 +188,17 @@ private:
  }
   void resize(uint64_t size)
   {
-    alignedAllocator<vobj> alloc;
     if ( this->_odata_size != size ) {
+      alignedAllocator<vobj> alloc;
+      // Only a change of size releases the old storage and allocates afresh; otherwise nothing is touched.
       dealloc();
+      
+      this->_odata_size = size;
+      if ( size ) 
+	this->_odata      = alloc.allocate(this->_odata_size);
+      else 
+	this->_odata      = nullptr;
     }
-    this->_odata_size = size;
-    if ( size ) 
-      this->_odata      = alloc.allocate(this->_odata_size);
-    else 
-      this->_odata      = nullptr;
   }
 public:
  /////////////////////////////////////////////////////////////////////////////////
@@ -346,7 +349,7 @@ public:
   void reset(GridBase* grid) {
     if (this->_grid != grid) {
       this->_grid = grid;
-      this->_odata.resize(grid->oSites());
+      this->resize(grid->oSites()); // resize storage via this class's resize() to the new grid's oSites()
       this->checkerboard = 0;
     }
   }
--- a/Grid/lattice/Lattice_coordinate.h
+++ b/Grid/lattice/Lattice_coordinate.h
@@ -37,19 +37,18 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
  GridBase *grid = l.Grid();
  int Nsimd = grid->iSites();

-  Coordinate gcoor;
-  ExtractBuffer<scalar_type> mergebuf(Nsimd);
-
-  vector_type vI;
  auto l_v = l.View();
-  for(int o=0;o<grid->oSites();o++){
+  thread_for( o, grid->oSites(), {
+    vector_type vI;
+    Coordinate gcoor;
+    ExtractBuffer<scalar_type> mergebuf(Nsimd);
    for(int i=0;i<grid->iSites();i++){
      grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
      mergebuf[i]=(Integer)gcoor[mu];
    }
    merge<vector_type,scalar_type>(vI,mergebuf);
    l_v[o]=vI;
-  }
+  });
 };

 // LatticeCoordinate();
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -156,7 +156,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj,class sobj>
-void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
+accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
        
  GridBase *grid = l.Grid();

@@ -185,7 +185,7 @@ void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
 };

 template<class vobj,class sobj>
-void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
+accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){

  GridBase *grid=l.Grid();

--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -317,116 +317,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  }
 }

-template<class vobj>
-static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
-{
-  // std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
-
-  typedef typename vobj::scalar_type scalar_type;
-  std::vector<scalar_type> lsSum;
-  localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
-  globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
-  // std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
-}
-
-template <class vobj>
-static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
-{
-  // std::cout << GridLogMessage << "Start prep" << std::endl;
-  typedef typename vobj::vector_type   vector_type;
-  typedef typename vobj::scalar_type   scalar_type;
-  GridBase  *grid = lhs.Grid();
-  assert(grid!=NULL);
-  conformable(grid,rhs.Grid());
-
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-  // std::cout << GridLogMessage << "Start alloc" << std::endl;
-
-  Vector<vector_type> lvSum(rd); // will locally sum vectors first
-  lsSum.resize(ld,scalar_type(0.0));                    // sum across these down to scalars
-  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  
-  // std::cout << GridLogMessage << "End alloc" << std::endl;
-
-  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
-  for(int r=0;r<rd;r++){
-    lvSum[r]=Zero();
-  }
-
-  int e1=    grid->_slice_nblock[orthogdim];
-  int e2=    grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-  // std::cout << GridLogMessage << "End prep" << std::endl;
-  // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
-  vector_type vv;
-  auto l_v=lhs.View();
-  auto r_v=rhs.View();
-  thread_for( r,rd,{
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int ss = so + n * stride + b;
-        vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
-        lvSum[r] = lvSum[r] + vv;
-      }
-    }
-  });
-  // std::cout << GridLogMessage << "End parallel inner product" << std::endl;
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  Coordinate icoor(Nd);
-  for(int rt=0;rt<rd;rt++){
-
-    iScalar<vector_type> temp; 
-    temp._internal = lvSum[rt];
-    extract(temp,extracted);
-
-    for(int idx=0;idx<Nsimd;idx++){
-
-      grid->iCoorFromIindex(icoor,idx);
-
-      int ldx =rt+icoor[orthogdim]*rd;
-
-      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
-
-    }
-  }
-  // std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
-}
-template <class vobj>
-static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
-{
-  typedef typename vobj::scalar_type scalar_type;
-  GridBase *grid = lhs.Grid();
-  int fd = result.size();
-  int ld = lsSum.size();
-  // sum over nodes.
-  std::vector<scalar_type> gsum;
-  gsum.resize(fd, scalar_type(0.0));
-  // std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
-  for(int t=0;t<fd;t++){
-    int pt = t/ld; // processor plane
-    int lt = t%ld;
-    if ( pt == grid->_processor_coor[orthogdim] ) {
-      gsum[t]=lsSum[lt];
-    }
-  }
-  // std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
-  // std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
-  grid->GlobalSumVector(&gsum[0], fd);
-  // std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
-
-  result = gsum;
-}
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -1,5 +1,4 @@
 /*************************************************************************************
-
    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/lattice/Lattice_transfer.h
@@ -83,12 +82,35 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
  
-
 template<class vobj,class CComplex,int nbasis>
 inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
+			  const             Lattice<vobj>   &fineData,
+			  const std::vector<Lattice<vobj> > &Basis)
+{
+  GridBase * fine  = fineData.Grid();
+  GridBase * coarse= coarseData.Grid();
+  // ip is scratch on the coarse grid: it receives blockInnerProduct(Basis[v],fineData) for one v at a time.
+  Lattice<CComplex> ip(coarse); 
+  // Each pass of the loop below copies ip into component v of coarseData.
+  //  auto fineData_   = fineData.View();
+  auto coarseData_ = coarseData.View();
+  auto ip_         = ip.View();
+  for(int v=0;v<nbasis;v++) {
+    blockInnerProduct(ip,Basis[v],fineData);
+    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+	coalescedWrite(coarseData_[sc](v),ip_(sc));
+      });
+  }
+}
+
+template<class vobj,class CComplex,int nbasis>
+inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 const             Lattice<vobj>   &fineData,
 			 const std::vector<Lattice<vobj> > &Basis)
 {
+  typedef iVector<CComplex,nbasis > coarseSiteData;
+  coarseSiteData elide;
+  typedef decltype(coalescedRead(elide)) ScalarComplex;
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
  int  _ndimension = coarse->_ndimension;
@@ -106,26 +128,40 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
    assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
  }
+  int blockVol = fine->oSites()/coarse->oSites();

  coarseData=Zero();

  auto fineData_   = fineData.View();
  auto coarseData_ = coarseData.View();
-  // Loop over coars parallel, and then loop over fine associated with coarse.
-  thread_for( sf, fine->oSites(), {
-    int sc;
-    Coordinate coor_c(_ndimension);
-    Coordinate coor_f(_ndimension);
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // To make this lock free, loop over coarse sites in parallel, and then loop over the fine sites associated with each coarse site.
+  // Otherwise do fine inner product per site, and make the update atomic
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////
+  accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {

-    thread_critical {
-      for(int i=0;i<nbasis;i++) {
-	auto Basis_      = Basis[i].View();
-	coarseData_[sc](i)=coarseData_[sc](i) + innerProduct(Basis_[sf],fineData_[sf]);
-      }
+    auto sc=sci/nbasis;
+    auto i=sci%nbasis;
+    auto Basis_      = Basis[i].View();
+
+    Coordinate coor_c(_ndimension);
+    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
+
+    int sf;
+    decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
+
+    for(int sb=0;sb<blockVol;sb++){
+
+      Coordinate coor_b(_ndimension);
+      Coordinate coor_f(_ndimension);
+
+      Lexicographic::CoorFromIndex(coor_b,sb,block_r);
+      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
+      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
+      
+      reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
    }
+    coalescedWrite(coarseData_[sc](i),reduce);
  });
  return;
 }
@@ -160,7 +196,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
  auto fineY_  = fineY.View();
  auto coarseA_= coarseA.View();

-  thread_for(sf, fine->oSites(), {
+  accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
    
    int sc;
    Coordinate coor_c(_ndimension);
@@ -171,7 +207,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

    // z = A x + y
-    fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf];
+    coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));

  });

@@ -196,7 +232,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,

  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
-  thread_for(ss, coarse->oSites(),{
+  accelerator_for(ss, coarse->oSites(), 1, {
    CoarseInner_[ss] = coarse_inner_[ss];
  });
 }
@@ -226,23 +262,29 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  for(int d=0 ; d<_ndimension;d++){
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
  }
+  int blockVol = fine->oSites()/coarse->oSites();

  // Turn this around to loop threaded over sc and interior loop 
  // over sf would thread better
-  coarseData=Zero();
  auto coarseData_ = coarseData.View();
  auto fineData_   = fineData.View();

-  thread_for(sf,fine->oSites(),{
-    int sc;
+  accelerator_for(sc,coarse->oSites(),1,{
+
+    // One thread per sub block
    Coordinate coor_c(_ndimension);
-    Coordinate coor_f(_ndimension);
-    
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
-    
-    thread_critical { 
+    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
+    coarseData_[sc]=Zero();
+
+    for(int sb=0;sb<blockVol;sb++){
+      
+      int sf;
+      Coordinate coor_b(_ndimension);
+      Coordinate coor_f(_ndimension);
+      Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
+      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
+      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
+
      coarseData_[sc]=coarseData_[sc]+fineData_[sf];
    }

@@ -296,6 +338,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
  }
 }

+#if 0
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -321,7 +364,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  auto coarseData_ = coarseData.View();

  // Loop with a cache friendly loop ordering
-  thread_for(sf,fine->oSites(),{
+  accelerator_for(sf,fine->oSites(),1,{
    int sc;
    Coordinate coor_c(_ndimension);
    Coordinate coor_f(_ndimension);
@@ -332,13 +375,35 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,

    for(int i=0;i<nbasis;i++) {
      auto basis_ = Basis[i].View();
-      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf];
-      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf];
+      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
+      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
    }
  });
  return;
  
 }
+#else
+// Promote coarse -> fine: fineData = sum_i coarseData(i)*Basis[i], one blockZAXPY call per basis vector.
+template<class vobj,class CComplex,int nbasis>
+inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
+			 Lattice<vobj>   &fineData,
+			 const std::vector<Lattice<vobj> > &Basis)
+{
+  GridBase * coarse= coarseData.Grid();
+
+  fineData=Zero();   // accumulator: every blockZAXPY below adds one term into fineData
+  for(int i=0;i<nbasis;i++) {
+    Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);   // i-th component of the coarse vector
+    Lattice<CComplex> cip(coarse);                                 // same values as a plain Lattice<CComplex>
+    auto cip_ = cip.View();
+    auto  ip_ =  ip.View();
+    accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
+	coalescedWrite(cip_[sc], ip_(sc)());   // strip the iScalar wrapper site by site
+    });
+    blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);    // fineData = cip*Basis[i] + fineData
+  }
+}
+#endif

 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
@@ -374,6 +439,67 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
  });
 }

+template<class vobj> // Copy the sites of From lying inside a rectangular region into To at a shifted corner, one site at a time
+void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize)
+{ // FromLowerLeft/ToLowerLeft: region corner in each lattice; RegionSize: extent per dimension (compared against coordinates built from Fg->_ldimensions)
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  static const int words=sizeof(vobj)/sizeof(vector_type); // number of vector_type words in one vobj
+
+  GridBase *Fg = From.Grid();
+  GridBase *Tg = To.Grid();
+  assert(!Fg->_isCheckerBoarded); // checkerboarded grids are rejected for both lattices
+  assert(!Tg->_isCheckerBoarded);
+  int Nsimd = Fg->Nsimd(); // NOTE(review): also used as the word stride on the To side below - presumably Tg has the same Nsimd; confirm
+  int nF = Fg->_ndimension;
+  int nT = Tg->_ndimension;
+  int nd = nF;
+  assert(nF == nT); // both lattices must have the same number of dimensions
+
+  for(int d=0;d<nd;d++){
+    assert(Fg->_processors[d]  == Tg->_processors[d]); // same processor count in every dimension
+  }
+
+  // The checks above are intended to guarantee that every copy below is local
+  Coordinate ldf = Fg->_ldimensions;
+  Coordinate rdf = Fg->_rdimensions;
+  Coordinate isf = Fg->_istride;
+  Coordinate osf = Fg->_ostride;
+  Coordinate rdt = Tg->_rdimensions;
+  Coordinate ist = Tg->_istride;
+  Coordinate ost = Tg->_ostride;
+  auto t_v = To.View();
+  auto f_v = From.View();
+  accelerator_for(idx,Fg->lSites(),1,{ // one iteration per index in [0, Fg->lSites())
+    sobj s; // only referenced by the commented-out peek/poke path at the end of this body
+    Coordinate Fcoor(nd);
+    Coordinate Tcoor(nd);
+    Lexicographic::CoorFromIndex(Fcoor,idx,ldf); // coordinate of idx inside the box of size Fg->_ldimensions
+    int in_region=1;
+    for(int d=0;d<nd;d++){
+      if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){ 
+	in_region=0; // outside [FromLowerLeft, FromLowerLeft+RegionSize) in dimension d
+      }
+      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; // same offset from the corner; NOTE(review): not range-checked against To's dimensions here
+    }
+    if (in_region) {
+      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // quotient by _rdimensions weighted by _istride: offset inside the site object
+      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
+      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // remainder by _rdimensions weighted by _ostride: index into the view
+      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
+      scalar_type * fp = (scalar_type *)&f_v[odx_f]; // reinterpret the selected vobj as a flat scalar_type array
+      scalar_type * tp = (scalar_type *)&t_v[odx_t];
+      for(int w=0;w<words;w++){
+	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME: if an RRII layout is used, this type pun will not work
+      }
+      //      peekLocalSite(s,From,Fcoor);
+      //      pokeLocalSite(s,To  ,Tcoor);
+    }
+  });
+}
+

 template<class vobj>
 void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -354,6 +354,6 @@ public:
  }
 };

-NAMESPACE_END(QCD);
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/perfmon/PerfCount.h
+++ b/Grid/perfmon/PerfCount.h
@@ -44,8 +44,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
+#ifdef GRID_NVCC
+accelerator_inline uint64_t __rdtsc(void) {  return 0; } // GRID_NVCC stub defined instead of including <x86intrin.h>: always returns 0
+accelerator_inline uint64_t __rdpmc(int ) {  return 0; } // GRID_NVCC stub: argument ignored, always returns 0
+#else
 #include <x86intrin.h>
 #endif
+#endif

 NAMESPACE_BEGIN(Grid);

@@ -89,13 +94,8 @@ inline uint64_t cyclecount(void){
  return tmp;
 }
 #elif defined __x86_64__
-#ifdef GRID_NVCC
-accelerator_inline uint64_t __rdtsc(void) {  return 0; }
-#endif
 inline uint64_t cyclecount(void){ 
  return __rdtsc();
-  //  unsigned int dummy;
-  // return __rdtscp(&dummy);
 }
 #else

--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -101,7 +101,8 @@ public:
  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

  // Efficient support for multigrid coarsening
-  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

  void   Meooe5D       (const FermionField &in, FermionField &out);
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -62,14 +62,15 @@ public:

  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

-      ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
-      //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
-      //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+  ///////////////////////////////////////////////////////////////
+  // Physical surface field utilities
+  ///////////////////////////////////////////////////////////////
+  //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
+  //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

  // Constructors
  ContinuedFractionFermion5D(GaugeField &_Umu,
--- a/Grid/qcd/action/fermion/DomainWallFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallFermion.h
@@ -80,7 +80,7 @@ public:
 	  theFFT.FFT_all_dim(out,prop_k,FFT::backward);
        }
 	//phase for boundary condition
-	out = out * exp(Scalar(2.0*M_PI)*ci*ph);
+	out = out * exp(ci*ph);
      };

      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -89,6 +89,7 @@ public:

  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac


      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -103,6 +103,7 @@ public:
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);

  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -86,7 +86,8 @@ public:
  void   MooeeDag    (const FermionField &in, FermionField &out);
  void   MooeeInvDag (const FermionField &in, FermionField &out);

-  void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);

  // These can be overridden by fancy 5d chiral action
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -67,12 +67,13 @@ public:

  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

-      ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+  ///////////////////////////////////////////////////////////////
+  // Physical surface field utilities
+  ///////////////////////////////////////////////////////////////
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

  // Constructors
  PartialFractionFermion5D(GaugeField &_Umu,
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -115,9 +115,10 @@ public:
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
-  void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
-                   int gamma, int dag);
+  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out);
+  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag);

  ///////////////////////////////////////////////////////////////
  // Extra methods added by derived
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -111,15 +111,16 @@ public:
  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void   MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

  // These can be overridden by fancy 5d chiral action
  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-      void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-      void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;

  // Implement hopping term non-hermitian hopping term; half cb or both
  // Implement s-diagonal DW
@@ -131,6 +132,9 @@ public:
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
+  void DhopDirComms(const FermionField &in);
+  void DhopDirCalc(const FermionField &in, FermionField &out,int point);
    
  ///////////////////////////////////////////////////////////////
  // New methods added 
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -60,6 +60,9 @@ public:
 			    int Ls, int Nsite, const FermionField &in, FermionField &out,
 			    int interior=1,int exterior=1) ;

+  static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
+			  int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
+
  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);

@@ -100,8 +103,17 @@ public:

 private:

-  static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
+  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
 				   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
+
+  static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
      
  // Specialised variants
  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
--- a/Grid/qcd/action/fermion/g5HermitianLinop.h
+++ b/Grid/qcd/action/fermion/g5HermitianLinop.h
@@ -54,6 +54,14 @@ public:
    _Mat.Mdir(in,tmp,dir,disp);
    G5R5(out,tmp);
  }
+  void OpDirAll(const Field &in, std::vector<Field> &out) {
+    Field scratch(in.Grid());
+    _Mat.MdirAll(in,out);            // fill out[] through the wrapped operator's MdirAll
+    for(auto &term : out) {
+      scratch=term;                  // copy so that G5R5 reads from a buffer distinct from its destination
+      G5R5(term,scratch);
+    }
+  }

  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){

@@ -96,6 +104,12 @@ public:
    _Mat.Mdir(in,tmp,dir,disp);
    out=g5*tmp;
  }
+  void OpDirAll(const Field &in, std::vector<Field> &out) {
+    _Mat.MdirAll(in,out);            // fill out[] through the wrapped operator's MdirAll
+    for(auto &term : out) {
+      term=g5*term;                  // left-multiply every entry by g5 in place
+    }
+  }

  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){

--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -383,11 +383,20 @@ void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &
 }

 template<class Impl>
-void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  Meo5D(psi,this->tmp());
-  // Apply 4d dslash fragment
-  this->DhopDir(this->tmp(),chi,dir,disp);
+void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp) // chi = DhopDir applied to Meo5D(psi) for one (dir,disp)
+{
+  FermionField tmp(psi.Grid()); // call-local temporary on psi's grid (replaces the this->tmp() used by the removed lines)
+  Meo5D(psi,tmp);
+  this->DhopDir(tmp,chi,dir,disp); // apply the 4d dslash fragment for the requested (dir,disp)
 }
+template<class Impl>
+void  CayleyFermion5D<Impl>::MdirAll(const FermionField &psi, std::vector<FermionField> &out)
+{ // out = DhopDirAll applied to Meo5D(psi)
+  FermionField meo_psi(psi.Grid());   // call-local field on psi's grid, receives Meo5D(psi)
+  Meo5D(psi,meo_psi);
+  this->DhopDirAll(meo_psi,out);
+}
+
 // force terms; five routines; default to Dhop on diagonal
 template<class Impl>
 void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -10,6 +10,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Gianluca Filaci <g.filaci@ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -54,6 +55,10 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
  auto chi = chi_i.View();
  assert(phi.Checkerboard() == psi.Checkerboard());

+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
  int Ls =this->Ls;

  // 10 = 3 complex mult + 2 complex add
@@ -71,7 +76,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
      uint64_t idx_l = ss+((s+Ls-1)%Ls);
      spProj5m(tmp1,psi(idx_u));
      spProj5p(tmp2,psi(idx_l));
-      coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2);
+      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
@@ -93,6 +98,10 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
  auto chi = chi_i.View();
  assert(phi.Checkerboard() == psi.Checkerboard());

+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
  int Ls=this->Ls;

  // Flops = 6.0*(Nc*Ns) *Ls*vol
@@ -109,7 +118,7 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
      uint64_t idx_l = ss+((s+Ls-1)%Ls);
      spProj5p(tmp1,psi(idx_u));
      spProj5m(tmp2,psi(idx_l));
-      coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2);
+      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
    }
  });
  M5Dtime+=usecond();
@@ -139,39 +148,41 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp;
+    spinor tmp, acc, res;

-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0]
-    for(int s=1;s<Ls;s++){
-      spProj5p(tmp,chi(ss+s-1));  
-      coalescedWrite(chi[ss+s] , psi(ss+s)-plee[s-1]*tmp);
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (L^{\prime})^{-1} L_m^{-1}
+    res = psi(ss);
+    spProj5m(tmp,res);
+    acc = pleem[0]*tmp;
+    spProj5p(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= plee[s-1]*tmp;
+      spProj5m(tmp,res);
+      acc += pleem[s]*tmp;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
-
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -pleem[s]P_- chi
-      spProj5m(tmp,chi(ss+s));    
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp);
-    }
-
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-      spProj5p(tmp,chi(ss+Ls-1)); 
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s)-(pueem[s]/pdee[Ls-1])*tmp);
-    }	
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-      
-    // Apply U^{-1}
+    res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
+    
+    // Apply U_m^{-1} D^{-1} U^{-1}
+    res = (1.0/pdee[Ls-1])*res;
+    coalescedWrite(chi[ss+Ls-1],res);
+    spProj5p(acc,res);
+    spProj5m(tmp,res);
    for (int s=Ls-2;s>=0;s--){
-      spProj5m(tmp,chi(ss+s+1));  
-      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp);
+      res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
  });

  MooeeInvTime+=usecond();
-
+  
 }

 template<class Impl>
@@ -201,31 +212,36 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
  accelerator_for(sss,nloop,Simd::Nsimd(),{
    uint64_t ss=sss*Ls;
    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp;
+    spinor tmp, acc, res;

-    // Apply (U^{\prime})^{-dagger}
-    coalescedWrite(chi[ss],psi(ss));
-    for (int s=1;s<Ls;s++){
-      spProj5m(tmp,chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s)-conjugate(puee[s-1])*tmp);
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
+    res = psi(ss);
+    spProj5p(tmp,res);
+    acc = conjugate(pueem[0])*tmp;
+    spProj5m(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= conjugate(puee[s-1])*tmp;
+      spProj5p(tmp,res);
+      acc += conjugate(pueem[s])*tmp;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-      spProj5p(tmp,chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      spProj5m(tmp,chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], conjugate(1.0/pdee[s])*chi(ss+s)-conjugate(pleem[s]/pdee[Ls-1])*tmp);
-    }	
-    coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*chi(ss+Ls-1));
-  
-    // Apply L^{-dagger}
+    res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc;
+    
+    // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
+    res = conjugate(1.0/pdee[Ls-1])*res;
+    coalescedWrite(chi[ss+Ls-1],res);
+    spProj5m(acc,res);
+    spProj5p(tmp,res);
    for (int s=Ls-2;s>=0;s--){
-      spProj5p(tmp,chi(ss+s+1));
-      coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp);
+      res = conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
  });
  MooeeInvTime+=usecond();
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -143,6 +143,25 @@ void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFi
  }
 }
 template<class Impl>
+void  ContinuedFractionFermion5D<Impl>::MdirAll (const FermionField &psi, std::vector<FermionField> &chi)
+{
+  const int Ls = this->Ls;
+
+  // DhopDirAll fills chi first; ag5xpby_ssp is then applied to every entry for each s in [0,Ls).
+  this->DhopDirAll(psi,chi); // Dslash on diagonal. g5 Dslash is hermitian
+
+  for(auto &term : chi){
+    for(int s=0;s<Ls;s++){
+      const int sign = (s%2==0) ? 1 : -1;   // alternates +1,-1,... starting from +1 at s=0
+      if ( s!=(Ls-1) ){
+	ag5xpby_ssp(term,cc[s]*Beta[s]*sign*ZoloHiInv,term,0.0,term,s,s);
+      } else {
+	ag5xpby_ssp(term,Beta[s]*ZoloHiInv,term,0.0,term,s,s); // last s: coefficient has no cc[s] or sign factor
+      }
+    }
+  }
+}
+template<class Impl>
 void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls = this->Ls;
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@@ -11,6 +11,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
+Author: Gianluca Filaci <g.filaci@ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -49,6 +50,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
  auto psi = psi_i.View();
  auto chi = chi_i.View();
  assert(phi.Checkerboard() == psi.Checkerboard());
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
@@ -63,7 +67,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
      uint64_t idx_l = ss+((s+Ls-1)%Ls);
      spProj5m(tmp1, psi(idx_u));
      spProj5p(tmp2, psi(idx_l));
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
    }
  });

@@ -82,6 +86,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
  auto phi = phi_i.View();
  auto chi = chi_i.View();
  assert(phi.Checkerboard() == psi.Checkerboard());
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
@@ -97,7 +104,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
      uint64_t idx_l = ss+((s+Ls-1)%Ls);
      spProj5p(tmp1, psi(idx_u));
      spProj5m(tmp2, psi(idx_l));
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
    }
  });

@@ -124,36 +131,37 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
  this->MooeeInvTime -= usecond();
  uint64_t nloop=grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-    auto ss=sss*Ls;
+    uint64_t ss=sss*Ls;
    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp1,tmp2;
+    spinor tmp, acc, res;

-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1);
+    // Apply (L^{\prime})^{-1} L_m^{-1}
+    res = psi(ss);
+    spProj5m(tmp,res);
+    acc = pleem[0]*tmp;
+    spProj5p(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= plee[s-1]*tmp;
+      spProj5m(tmp,res);
+      acc += pleem[s]*tmp;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls])*tmp1);
-    }
-    spProj5m(tmp2, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1],(1.0/pdee[Ls])*tmp1 + (1.0/pdee[Ls-1])*tmp2);
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp1, chi(ss+s+1));
-      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp1);
+    res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
+    
+    // Apply U_m^{-1} D^{-1} U^{-1}
+    acc = (1.0/pdee[Ls  ])*res;
+    tmp = (1.0/pdee[Ls-1])*res;
+    spProj5p(acc,acc);
+    spProj5m(tmp,tmp);
+    coalescedWrite(chi[ss+Ls-1], acc + tmp);
+    for (int s=Ls-2;s>=0;s--){
+      res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
  });
  this->MooeeInvTime += usecond();
@@ -168,56 +176,50 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
  auto chi = chi_i.View();
  int Ls = this->Ls;

+  auto plee  = & this->lee[0];
+  auto pdee  = & this->dee[0];
+  auto puee  = & this->uee[0];
+
+  auto pleem = & this->leem[0];
+  auto pueem = & this->ueem[0];
+
  assert(psi.Checkerboard() == psi.Checkerboard());

-  Vector<Coeff_t> ueec(Ls);
-  Vector<Coeff_t> deec(Ls+1);
-  Vector<Coeff_t> leec(Ls);
-  Vector<Coeff_t> ueemc(Ls);
-  Vector<Coeff_t> leemc(Ls);
-
-  for(int s=0; s<ueec.size(); s++){
-    ueec[s]  = conjugate(this->uee[s]);
-    deec[s]  = conjugate(this->dee[s]);
-    leec[s]  = conjugate(this->lee[s]);
-    ueemc[s] = conjugate(this->ueem[s]);
-    leemc[s] = conjugate(this->leem[s]);
-  }
-  deec[Ls] = conjugate(this->dee[Ls]);
-
  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();
  auto nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss=sss*Ls;
    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp1,tmp2;
-    auto ss=sss*Ls;
+    spinor tmp, acc, res;

-    // Apply (U^{\prime})^{-dagger}
-    coalescedWrite(chi[ss], psi(ss));
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - ueec[s-1]*tmp1);
+    // Apply (U^{\prime})^{-dagger} U_m^{-\dagger} 
+    res = psi(ss);
+    spProj5p(tmp,res);
+    acc = conjugate(pueem[0])*tmp;
+    spProj5m(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= conjugate(puee[s-1])*tmp;
+      spProj5p(tmp,res);
+      acc += conjugate(pueem[s])*tmp;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - ueemc[s]*tmp1);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s] ,(1.0/deec[s])*chi(ss+s) - (leemc[s]/deec[Ls-1])*tmp1);
-    }
-    spProj5p(tmp2, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1], (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2);
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp1, chi(ss+s+1));
-      coalescedWrite(chi[ss+s],chi(ss+s) - leec[s]*tmp1);
+    res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc;
+    
+    // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
+    acc = conjugate(1.0/pdee[Ls-1])*res;
+    tmp = conjugate(1.0/pdee[Ls  ])*res;
+    spProj5m(acc,acc);
+    spProj5p(tmp,tmp);
+    coalescedWrite(chi[ss+Ls-1], acc + tmp);
+    for (int s=Ls-2;s>=0;s--){
+      res = conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
  });

--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
 template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  assert(0); // Not implemented yet: aborts when assertions are enabled; `out` is never written
+}
+template <class Impl>
 RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -362,12 +362,19 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  assert(0); // Not implemented yet: aborts when assertions are enabled; `out` is never written
+}

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
+{

  Compressor compressor;
  Stencil.HaloExchange(in, compressor);
@@ -380,6 +387,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
  });
 };

+
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
 						  DoubledGaugeField &U,
@@ -404,7 +412,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 #ifdef GRID_OMP
  Compressor compressor; 
  int len =  U.Grid()->oSites();
-  const int LLs =  1;

  DhopTotalTime   -= usecond();

--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@@ -11,6 +11,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
+Author: Gianluca Filaci <g.filaci@ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -49,6 +50,10 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField

  assert(phi.Checkerboard() == psi.Checkerboard());

+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
@@ -64,7 +69,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
      uint64_t idx_l = ss+((s+Ls-1)%Ls);
      spProj5m(tmp1, psi(idx_u));
      spProj5p(tmp2, psi(idx_l));
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
    }
  });

@@ -88,6 +93,11 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion

  assert(phi.Checkerboard() == psi.Checkerboard());

+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+  auto pshift_coeffs = &shift_coeffs[0];
+
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
@@ -108,7 +118,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
      if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); }
      else       { spProj5m(tmp, psi(ss+shift_s)); }

-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 +lower[s]*tmp2 + shift_coeffs[s]*tmp);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 +plower[s]*tmp2 + pshift_coeffs[s]*tmp);
    }
  });

@@ -128,6 +138,10 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie

  assert(phi.Checkerboard() == psi.Checkerboard());

+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
@@ -144,7 +158,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
      uint64_t idx_l = ss+((s+Ls-1)%Ls);
      spProj5p(tmp1, psi(idx_u));
      spProj5m(tmp2, psi(idx_l));
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
    }
  });

@@ -166,6 +180,11 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm

  assert(phi.Checkerboard() == psi.Checkerboard());

+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+  auto pshift_coeffs = &shift_coeffs[0];
+
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();
@@ -189,12 +208,12 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
      spProj5p(tmp1, psi(idx_u));
      spProj5m(tmp2, psi(idx_l));

-      if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
-      else          coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
+      else          coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
      if(pm == 1){ spProj5p(tmp, psi(ss+s)); }
      else       { spProj5m(tmp, psi(ss+s)); }

-      coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+shift_coeffs[s]*tmp);
+      coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+pshift_coeffs[s]*tmp);
    }
  });

@@ -223,36 +242,38 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-
-    uint64_t ss = sss*Ls;
-
+    uint64_t ss=sss*Ls;
    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp;
+    spinor tmp, acc, res;

-    // Apply (L^{\prime})^{-1}
-    coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp);
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (L^{\prime})^{-1} L_m^{-1}
+    res = psi(ss);
+    spProj5m(tmp,res);
+    acc = pleem[0]*tmp;
+    spProj5p(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= plee[s-1]*tmp;
+      spProj5m(tmp,res);
+      acc += pleem[s]*tmp;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp);
-    }
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp, chi(ss+s+1));
-      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp);
+    res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
+    
+    // Apply U_m^{-1} D^{-1} U^{-1}
+    res = (1.0/pdee[Ls-1])*res;
+    coalescedWrite(chi[ss+Ls-1],res);
+    spProj5p(acc,res);
+    spProj5m(tmp,res);
+    for (int s=Ls-2;s>=0;s--){
+      res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
  });
   
@@ -281,45 +302,45 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
+      uint64_t ss=sss*Ls;
+      typedef decltype(coalescedRead(psi[0])) spinor;
+      spinor tmp, acc, res, tmp_spProj;

-    uint64_t ss = sss*Ls;
+      // Apply (L^{\prime})^{-1} L_m^{-1}
+      res = psi(ss);
+      spProj5m(tmp,res);
+      acc = pleem[0]*tmp;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss],res);
+      tmp_spProj = pMooeeInv_shift_lc[0]*res;

-    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp1,tmp2,tmp2_spProj;
+      for(int s=1;s<Ls-1;s++){
+	res = psi(ss+s);
+	tmp_spProj += pMooeeInv_shift_lc[s]*res;
+	res -= plee[s-1]*tmp;
+	spProj5m(tmp,res);
+	acc += pleem[s]*tmp;
+	spProj5p(tmp,res);
+	coalescedWrite(chi[ss+s],res);
+      }
+      res = psi(ss+Ls-1);

-    // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
-    coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0]
-    tmp2 = pMooeeInv_shift_lc[0]*psi(ss);
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1);
-      tmp2 = tmp2 + pMooeeInv_shift_lc[s]*psi(ss+s);
-    }
-    if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-    else       { spProj5m(tmp2_spProj, tmp2); }
+      tmp_spProj += pMooeeInv_shift_lc[Ls-1]*res;
+      if(pm == 1){ spProj5p(tmp_spProj, tmp_spProj);}
+      else       { spProj5m(tmp_spProj, tmp_spProj); }

-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1);
-    }
+      res = res - plee[Ls-2]*tmp - acc;

-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp1);
-    }
-    // chi[ss+Ls-1] = (1.0/pdee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-    spProj5m(tmp1, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInv_shift_norm[Ls-1]*tmp2_spProj);
-
-    // Apply U^{-1} and add shift term
-    for(int s=Ls-2; s>=0; s--){
-      coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1);
-      spProj5m(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj);
-    }
+      // Apply U_m^{-1} D^{-1} U^{-1}
+      res = (1.0/pdee[Ls-1])*res;
+      spProj5p(acc,res);
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+Ls-1], res + pMooeeInv_shift_norm[Ls-1]*tmp_spProj);
+      for (int s=Ls-2;s>=0;s--){
+	res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
+	spProj5m(tmp,res);
+	coalescedWrite(chi[ss+s], res + pMooeeInv_shift_norm[s]*tmp_spProj);
+      }
  });

  this->MooeeInvTime += usecond();
@@ -347,39 +368,40 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
-
-    uint64_t ss = sss*Ls;
-
+    uint64_t ss=sss*Ls;
    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp;
+    spinor tmp, acc, res;

-    // Apply (U^{\prime})^{-dag}
-    coalescedWrite(chi[ss], psi(ss));
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - puee[s-1]*tmp);
-    }
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
+    res = psi(ss);
+    spProj5p(tmp,res);
+    acc = pueem[0]*tmp;
+    spProj5m(tmp,res);
+    coalescedWrite(chi[ss],res);
    
-    // U_m^{-\dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp);
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= puee[s-1]*tmp;
+      spProj5p(tmp,res);
+      acc += pueem[s]*tmp;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
-
-    // L_m^{-\dag} D^{-dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp);
-    }
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-
-    // Apply L^{-dag}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp, chi(ss+s+1));
-      coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp);
+    res = psi(ss+Ls-1) - puee[Ls-2]*tmp - acc;
+    
+    // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
+    res = (1.0/pdee[Ls-1])*res;
+    coalescedWrite(chi[ss+Ls-1],res);
+    spProj5m(acc,res);
+    spProj5p(tmp,res);
+    for (int s=Ls-2;s>=0;s--){
+      res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
    }
  });
-
  this->MooeeInvTime += usecond();
 }

@@ -406,45 +428,45 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi

  int nloop = grid->oSites()/Ls;
  accelerator_for(sss,nloop,Simd::Nsimd(),{
+      uint64_t ss=sss*Ls;
+      typedef decltype(coalescedRead(psi[0])) spinor;
+      spinor tmp, acc, res, tmp_spProj;

-    uint64_t ss = sss*Ls;
+      // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
+      res = psi(ss);
+      spProj5p(tmp,res);
+      acc = pueem[0]*tmp;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss],res);
+      tmp_spProj = pMooeeInvDag_shift_lc[0]*res;

-    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp1,tmp2,tmp2_spProj;
+      for(int s=1;s<Ls-1;s++){
+	res = psi(ss+s);
+	tmp_spProj += pMooeeInvDag_shift_lc[s]*res;
+	res -= puee[s-1]*tmp;
+	spProj5p(tmp,res);
+	acc += pueem[s]*tmp;
+	spProj5m(tmp,res);
+	coalescedWrite(chi[ss+s],res);
+      }
+      res = psi(ss+Ls-1);

-    // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
-    coalescedWrite(chi[ss], psi(ss));
-    tmp2 = pMooeeInvDag_shift_lc[0]*psi(ss);
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s],psi(ss+s) - puee[s-1]*tmp1);
-      tmp2 = tmp2 + pMooeeInvDag_shift_lc[s]*psi(ss+s);
-    }
+      tmp_spProj += pMooeeInvDag_shift_lc[Ls-1]*res;
+      if(pm == 1){ spProj5p(tmp_spProj, tmp_spProj); }
+      else       { spProj5m(tmp_spProj, tmp_spProj); }

-    if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-    else       { spProj5m(tmp2_spProj, tmp2);}
+      res = res - puee[Ls-2]*tmp - acc;

-    // U_m^{-\dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp1);
-    }
-
-    // L_m^{-\dag} D^{-dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp1);
-    }
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-    spProj5p(tmp1, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInvDag_shift_norm[Ls-1]*tmp2_spProj);
-
-    // Apply L^{-dag}
-    for(int s=Ls-2; s>=0; s--){
-      coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1);
-      spProj5p(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj);
-    }
+      // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
+      res = (1.0/pdee[Ls-1])*res;
+      spProj5m(acc,res);
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+Ls-1], res + pMooeeInvDag_shift_norm[Ls-1]*tmp_spProj);
+      for (int s=Ls-2;s>=0;s--){
+	res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc;
+	spProj5p(tmp,res);
+	coalescedWrite(chi[ss+s], res + pMooeeInvDag_shift_norm[s]*tmp_spProj);
+      }
  });

  this->MooeeInvTime += usecond();
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Impl>
+ template<class Impl>
 void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
  // this does both dag and undag but is trivial; make a common helper routing
  int Ls = this->Ls;
@@ -45,8 +45,25 @@ void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFiel
    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
  }
  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-
 }
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::MdirAll (const FermionField &psi, std::vector<FermionField> &chi){
+  // this does both dag and undag but is trivial; make a common helper routine
+  int Ls = this->Ls;
+  // Fill every entry of chi from psi with a single DhopDirAll call.
+  this->DhopDirAll(psi,chi);
+  // Then, per entry and in place: -scale on slices s=2b, +scale on s+1, and p[nblock]*scale/amax on slice Ls-1.
+  for(int point=0;point<chi.size();point++){
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(chi[point],-scale,chi[point],0.0,chi[point],s,s); 
+      ag5xpby_ssp(chi[point], scale,chi[point],0.0,chi[point],s+1,s+1); 
+    }
+    ag5xpby_ssp(chi[point],p[nblock]*scale/amax,chi[point],0.0,chi[point],Ls-1,Ls-1);
+  }
+}
+
 template<class Impl>
 void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
 {
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -241,6 +241,15 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);

 };
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
+{
+  Compressor compressor(DaggerNo);       // always DaggerNo: this entry point takes no dag argument
+  Stencil.HaloExchange(in,compressor);   // halo exchange of `in` happens before the kernel is handed Stencil.CommBuf()
+  uint64_t Nsite = Umu.Grid()->oSites(); // site count comes from the gauge field's grid
+  Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out); // NOTE(review): if this resolves to WilsonKernels<Impl>::DhopDirAll, Nsite narrows to int - confirm
+};
+

 template<class Impl>
 void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -319,28 +319,51 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
 }

 template <class Impl>
-void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
+template <class Impl>
+void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  DhopDirAll(in, out); // thin forwarder: arguments are passed through unchanged
+}

 template <class Impl>
 void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in, compressor);
+
  int skip = (disp == 1) ? 0 : 1;
  int dirdisp = dir + skip * 4;
  int gamma = dir + (1 - skip) * 4;

-  DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
+  DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
 };
-
 template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
-  Compressor compressor(dag);
-
+  Compressor compressor(DaggerNo);
  Stencil.HaloExchange(in, compressor);
+
+  assert((out.size()==8)||(out.size()==9)); 
+  for(int dir=0;dir<Nd;dir++){
+    for(int disp=-1;disp<=1;disp+=2){
+
+      int skip = (disp == 1) ? 0 : 1;
+      int dirdisp = dir + skip * 4;
+      int gamma = dir + (1 - skip) * 4;
+
+      DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
+    }
+  }
+}
+template <class Impl>
+void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+{
  int Ls=1;
-  int Nsite=in.oSites();
+  uint64_t Nsite=in.oSites();
  Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma);
 };

@@ -348,7 +371,8 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
-                                       FermionField &out, int dag) {
+                                       FermionField &out, int dag) 
+{
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  }								\
  synchronise();						

-#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
-  if (gamma == Dir) {						\
+#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)		\
    if (SE->_is_local ) {					\
      int perm= SE->_permute;					\
      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
@@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
    }								\
    synchronise();						\
    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
-    Recon(result, Uchi);					\
-    synchronise();						\
+    Recon(result, Uchi);					
+
+#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
+  if (gamma == Dir) {						\
+    GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon);			\
  }

+
  ////////////////////////////////////////////////////////////////////
  // All legs kernels ; comms then compute
  ////////////////////////////////////////////////////////////////////
@@ -284,7 +287,36 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
  }
 };

-template <class Impl>
+#define DhopDirMacro(Dir,spProj,spRecon)	\
+  template <class Impl>							\
+  void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
+					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
+  {									\
+  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;		\
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;			\
+  calcHalfSpinor chi;							\
+  calcSpinor result;							\
+  calcHalfSpinor Uchi;							\
+  StencilEntry *SE;							\
+  int ptype;								\
+  const int Nsimd = SiteHalfSpinor::Nsimd();				\
+  const int lane=SIMTlane(Nsimd);					\
+									\
+  SE = st.GetEntry(ptype, dir, sF);					\
+  GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);				\
+  coalescedWrite(out[sF], result,lane);					\
+  }									
+
+DhopDirMacro(Xp,spProjXp,spReconXp);
+DhopDirMacro(Yp,spProjYp,spReconYp);
+DhopDirMacro(Zp,spProjZp,spReconZp);
+DhopDirMacro(Tp,spProjTp,spReconTp);
+DhopDirMacro(Xm,spProjXm,spReconXm);
+DhopDirMacro(Ym,spProjYm,spReconYm);
+DhopDirMacro(Zm,spProjZm,spReconZm);
+DhopDirMacro(Tm,spProjTm,spReconTm);
+
+template <class Impl> 
 void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
 {
@@ -299,18 +331,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  const int lane=SIMTlane(Nsimd);

  SE = st.GetEntry(ptype, dir, sF);
-  if (gamma == Xp) {						
-    if (SE->_is_local ) {					
-      int perm= SE->_permute;					
-      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	
-      spProjXp(chi,tmp);						
-    } else {							
-      chi = coalescedRead(buf[SE->_offset],lane);			
-    }								
-    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		
-    spReconXp(result, Uchi);					
-  }
-
+  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
@@ -321,6 +342,38 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  coalescedWrite(out[sF], result,lane);
 }

+template <class Impl>
+void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
+				      int Nsite, const FermionField &in, std::vector<FermionField> &out) 
+{  // For every site sF in [0,Nsite*Ls) run the eight single-leg kernels DhopDirXm..DhopDirTp, leg k writing out[k].
+   auto U_v   = U.View();
+   auto in_v  = in.View();
+   auto st_v  = st.View();
+   assert(out.size()>=8); // out[0..7] are indexed unconditionally just below
+   auto out_Xm = out[0].View();
+   auto out_Ym = out[1].View();
+   auto out_Zm = out[2].View();
+   auto out_Tm = out[3].View();
+   auto out_Xp = out[4].View();
+   auto out_Yp = out[5].View();
+   auto out_Zp = out[6].View();
+   auto out_Tp = out[7].View();
+   // Blocking launch (accelerator_for, not the NB variant): nothing below waits for the kernel and every view above is function-local.
+   accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
+      int sU=sss/Ls;				
+      int sF =sss;				
+      DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
+      DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
+      DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
+      DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
+      DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
+      DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
+      DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
+      DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
+   });
+}
+
+
 template <class Impl>
 void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 					 int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) 
@@ -332,13 +385,32 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   auto in_v  = in.View();
   auto out_v = out.View();
   auto st_v  = st.View();
-   accelerator_for(ss,Nsite,Simd::Nsimd(),{
-    for(int s=0;s<Ls;s++){
-      int sU=ss;
-      int sF = s+Ls*sU; 
-      DhopDirK(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp,gamma);
-    }
-  });
+#define LoopBody(Dir)				\
+   case Dir :			\
+     accelerator_forNB(ss,Nsite,Simd::Nsimd(),{	\
+       for(int s=0;s<Ls;s++){			\
+	 int sU=ss;				\
+	 int sF = s+Ls*sU;						\
+	 DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
+       }							       \
+       });							       \
+     break;
+
+   switch(gamma){
+   LoopBody(Xp);
+   LoopBody(Yp);
+   LoopBody(Zp);
+   LoopBody(Tp);
+
+   LoopBody(Xm);
+   LoopBody(Ym);
+   LoopBody(Zm);
+   LoopBody(Tm);
+   default:
+     assert(0);
+     break;
+   }
+#undef LoopBody
 } 

 #define KERNEL_CALLNB(A) \
--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 NAMESPACE_BEGIN(Grid);

--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc.master
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>

 NAMESPACE_BEGIN(Grid);
--- a/Grid/qcd/action/fermion/instantiation/StaggeredImplD/ImprovedStaggeredFermionInstantiationStaggeredImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/ImprovedStaggeredFermionInstantiationStaggeredImplD.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>

 NAMESPACE_BEGIN(Grid);
--- a/Grid/qcd/action/fermion/instantiation/StaggeredImplF/ImprovedStaggeredFermionInstantiationStaggeredImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/ImprovedStaggeredFermionInstantiationStaggeredImplF.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>

 NAMESPACE_BEGIN(Grid);
--- a/Grid/qcd/hmc/HMC_GridModules.h
+++ b/Grid/qcd/hmc/HMC_GridModules.h
@@ -97,7 +97,6 @@ protected:
 ////////////////////////////////////
 // Classes for the user
 ////////////////////////////////////
-// Note: the space time grid should be out of the QCD namespace
 template <class vector_type>
 class GridFourDimModule : public GridModule
 {
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -1,5 +1,34 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/smearing/StoutSmearing.h
+ 
+ Copyright (C) 2019
+ 
+ Author: unknown
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution
+ directory
+ *************************************************************************************/
 /*
-  @file stoutSmear.hpp
+  @file StoutSmearing.h
  @brief Declares Stout smearing class
 */
 #pragma once
@@ -9,19 +38,43 @@ NAMESPACE_BEGIN(Grid);
 /*!  @brief Stout smearing of link variable. */
 template <class Gimpl>
 class Smear_Stout : public Smear<Gimpl> {
-private:
-  const Smear<Gimpl>* SmearBase;
+ private:
+  int OrthogDim = -1;
+  const std::vector<double> SmearRho;
+  // Smear<Gimpl>* ownership semantics:
+  //    Smear<Gimpl>* passed in to constructor are owned by caller, so we don't delete them here
+  //    Smear<Gimpl>* created within constructor need to be deleted as part of the destructor
+  const std::unique_ptr<Smear<Gimpl>> OwnedBase; // deleted at destruction
+  const Smear<Gimpl>* SmearBase; // Not owned by this object, so not deleted at destruction

+  // Helper for the default constructor: Nd x Nd rho table, zero on the diagonal and wherever either index is orthogdim
+  inline static std::vector<double> rho3D(double rho, int orthogdim){
+    std::vector<double> weights(Nd*Nd, rho);
+    // flat index idx = mu + Nd * nu, so mu = idx % Nd and nu = idx / Nd
+    for (int idx=0; idx<Nd*Nd; idx++)
+      if (idx % Nd == idx / Nd || idx % Nd == orthogdim || idx / Nd == orthogdim) weights[idx] = 0.0;
+    return weights;
+  };
+  
 public:
  INHERIT_GIMPL_TYPES(Gimpl)

-  Smear_Stout(Smear<Gimpl>* base) : SmearBase(base) {
-    assert(Nc == 3);//                  "Stout smearing currently implemented only for Nc==3");
+  /*! Stout smearing with base explicitly specified */
+  Smear_Stout(Smear<Gimpl>* base) : SmearBase{base} {
+    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
  }

-  /*! Default constructor */
-  Smear_Stout(double rho = 1.0) : SmearBase(new Smear_APE<Gimpl>(rho)) {
-    assert(Nc == 3);//                  "Stout smearing currently implemented only for Nc==3");
+  /*! Construct stout smearing object from explicitly specified rho matrix; the APE base smearer built here is owned by this object */
+  Smear_Stout(const std::vector<double>& rho_)
+    : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
+    std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
+    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
+    }
+
+  /*! Default constructor. rho is constant in all directions, optionally except for orthogonal dimension */
+  Smear_Stout(double rho = 1.0, int orthogdim = -1)
+  : OrthogDim{orthogdim}, SmearRho{ rho3D(rho,orthogdim) }, OwnedBase{ new Smear_APE<Gimpl>(SmearRho) }, SmearBase{OwnedBase.get()} {
+    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
  }

  ~Smear_Stout() {}  // delete SmearBase...
@@ -36,12 +89,16 @@ public:
    SmearBase->smear(C, U);

    for (int mu = 0; mu < Nd; mu++) {
-      tmp = peekLorentz(C, mu);
-      Umu = peekLorentz(U, mu);
-      iq_mu = Ta(
-		 tmp *
-		 adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
-      exponentiate_iQ(tmp, iq_mu);
+      Umu = peekLorentz(U, mu);  // load unconditionally: the poke below uses Umu even when this direction is not smeared
+      if( mu == OrthogDim )
+        tmp = 1.0;  // Don't smear in the orthogonal direction
+      else {
+        tmp = peekLorentz(C, mu);
+        iq_mu = Ta(
+                   tmp *
+                   adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
+        exponentiate_iQ(tmp, iq_mu);
+      }
      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
    }
    std::cout << GridLogDebug << "Stout smearing completed\n";
@@ -80,6 +137,7 @@ public:
    iQ2 = iQ * iQ;
    iQ3 = iQ * iQ2;

+    //We should check sgn(c0) here already and then apply eq (34) from 0311018
    set_uw(u, w, iQ2, iQ3);
    set_fj(f0, f1, f2, u, w);

@@ -139,9 +197,8 @@ public:
  }

  LatticeComplex func_xi0(const LatticeComplex& w) const {
-    // Define a function to do the check
-    // if( w < 1e-4 ) std::cout << GridLogWarning<< "[Smear_stout] w too small:
-    // "<< w <<"\n";
+    // Definition from arxiv 0311018
+    //if (abs(w) < 0.05) {w2 = w*w; return 1.0 - w2/6.0 * (1.0-w2/20.0 * (1.0-w2/42.0));}
    return sin(w) / w;
  }

@@ -154,4 +211,3 @@ public:
 };

 NAMESPACE_END(Grid);
-
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -67,8 +67,21 @@ public:
        const std::vector<ComplexField> &emB1,
        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);

-  static void ContractWWVV(std::vector<PropagatorField> &WWVV,
-			   const Eigen::Tensor<ComplexD,3> &WW_sd,
+  template <typename TensorType>
+  typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                           std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                           void>::type
+  static ContractWWVV(std::vector<PropagatorField> &WWVV,
+			   const TensorType &WW_sd,
+			   const FermionField *vs,
+			   const FermionField *vd);
+
+  template <typename TensorType>
+  typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                            std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                            void>::type
+  static ContractWWVV(std::vector<PropagatorField> &WWVV,
+			   const TensorType &WW_sd,
 			   const FermionField *vs,
 			   const FermionField *vd);

@@ -98,6 +111,11 @@ public:
 			const FermionField *vd,
 			int orthogdim);
 #endif
+private:
+  inline static void OuterProductWWVV(PropagatorField &WWVV,
+                               const vobj &lhs,
+                               const vobj &rhs,
+                               const int Ns, const int ss);
 };

 template <class FImpl>
@@ -242,7 +260,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	      int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
 	      for(int mu=0;mu<Ngamma;mu++){
 		// this is a bit slow
-		mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]));
+		mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]))()()();
 	      }
 	    }
 	  }
@@ -968,9 +986,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
 // Take WW_sd v^dag_d (x) v_s
 // 

-template<class FImpl>
-void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
-				   const Eigen::Tensor<ComplexD,3> &WW_sd,
+template <class FImpl>
+template <typename TensorType>
+typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                         std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                         void>::type
+A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
+				   const TensorType &WW_sd,
 				   const FermionField *vs,
 				   const FermionField *vd)
 {
@@ -992,39 +1014,100 @@ void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
    for(int d_o=0;d_o<N_d;d_o+=d_unroll){
      for(int t=0;t<N_t;t++){
      for(int s=0;s<N_s;s++){
-	auto vs_v = vs[s].View();
-	auto tmp1 = vs_v[ss];
-	vobj tmp2 = Zero();
-	vobj tmp3 = Zero();
-	for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
-	  auto vd_v = vd[d].View();
-	  Scalar_v coeff = WW_sd(t,s,d);
-	  tmp3 = conjugate(vd_v[ss]);
-	  mac(&tmp2, &coeff, &tmp3);
-	}
+  auto vs_v = vs[s].View();
+  auto tmp1 = vs_v[ss];
+  vobj tmp2 = Zero();
+  vobj tmp3 = Zero();
+  for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+    auto vd_v = vd[d].View();
+    Scalar_v coeff = WW_sd(t,s,d);
+    tmp3 = conjugate(vd_v[ss]);
+    mac(&tmp2, &coeff, &tmp3);
+  }

-	//////////////////////////
-	// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
-	//////////////////////////
-	auto WWVV_v = WWVV[t].View();
-	for(int s1=0;s1<Ns;s1++){
-	for(int s2=0;s2<Ns;s2++){
-	  WWVV_v[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(0,1) += tmp1()(s1)(0)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(0,2) += tmp1()(s1)(0)*tmp2()(s2)(2);
-	  WWVV_v[ss]()(s1,s2)(1,0) += tmp1()(s1)(1)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(1,1) += tmp1()(s1)(1)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(1,2) += tmp1()(s1)(1)*tmp2()(s2)(2);
-	  WWVV_v[ss]()(s1,s2)(2,0) += tmp1()(s1)(2)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(2,1) += tmp1()(s1)(2)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(2,2) += tmp1()(s1)(2)*tmp2()(s2)(2);
-	}}
+  //////////////////////////
+  // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
+  //////////////////////////
+  OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);

      }}
    }
  });
 }

+template <class FImpl>
+template <typename TensorType>
+typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD, 3>, TensorType>::value ||
+                          std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                          void>::type
+A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
+                              const TensorType &WW_sd,
+                              const FermionField *vs,
+                              const FermionField *vd)
+{
+  GridBase *grid = vs[0].Grid();
+
+  int nd    = grid->_ndimension;
+  int Nsimd = grid->Nsimd();
+  int N_t = WW_sd.dimensions()[0];
+  int N_s = WW_sd.dimensions()[1];
+  int N_d = WW_sd.dimensions()[2];
+
+  int d_unroll = 32;// Empirical optimisation
+
+  Eigen::Matrix<Complex, -1, -1, Eigen::RowMajor> buf;
+
+  for(int t=0;t<N_t;t++){
+    WWVV[t] = Zero();
+  }
+
+  for (int t = 0; t < N_t; t++){
+    std::cout << GridLogMessage << "Contraction t = " << t << std::endl;
+    buf = WW_sd[t];
+    thread_for(ss,grid->oSites(),{
+      for(int d_o=0;d_o<N_d;d_o+=d_unroll){
+        for(int s=0;s<N_s;s++){
+    auto vs_v = vs[s].View();
+    auto tmp1 = vs_v[ss];
+    vobj tmp2 = Zero();
+    vobj tmp3 = Zero();
+    for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+      auto vd_v = vd[d].View();
+      Scalar_v coeff = buf(s,d);
+      tmp3 = conjugate(vd_v[ss]);
+      mac(&tmp2, &coeff, &tmp3);
+    }
+
+    //////////////////////////
+    // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
+    //////////////////////////
+    OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
+      }}
+    });
+  }
+}
+
+template <class FImpl>
+inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
+                                             const vobj &lhs,
+                                             const vobj &rhs,
+                                             const int Ns, const int ss)
+{
+  auto WWVV_v = WWVV.View();
+  for (int s1 = 0; s1 < Ns; s1++){
+    for (int s2 = 0; s2 < Ns; s2++){
+      WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(0, 1) += lhs()(s1)(0) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(0, 2) += lhs()(s1)(0) * rhs()(s2)(2);
+      WWVV_v[ss]()(s1,s2)(1, 0) += lhs()(s1)(1) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(1, 1) += lhs()(s1)(1) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(1, 2) += lhs()(s1)(1) * rhs()(s2)(2);
+      WWVV_v[ss]()(s1,s2)(2, 0) += lhs()(s1)(2) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(2, 1) += lhs()(s1)(2) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(2, 2) += lhs()(s1)(2) * rhs()(s2)(2);
+    }
+  }
+}

 template<class FImpl>
 void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0,
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -0,0 +1,624 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/utils/BaryonUtils.h
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <felix.erben@ed.ac.uk>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+#pragma once
+//#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Eigen/unsupported/CXX11/Tensor>
+
+NAMESPACE_BEGIN(Grid);
+
+template <typename FImpl>
+class BaryonUtils 
+{
+public:
+  typedef typename FImpl::ComplexField ComplexField;
+  typedef typename FImpl::FermionField FermionField;
+  typedef typename FImpl::PropagatorField PropagatorField;
+
+  typedef typename FImpl::SitePropagator pobj;
+  typedef typename ComplexField::vector_object vobj;
+
+  typedef Lattice<iSpinMatrix<typename FImpl::Simd>> SpinMatrixField;
+  typedef typename SpinMatrixField::vector_object sobj;
+
+  static const int epsilon[6][3] ;
+  static const Complex epsilon_sgn[6];
+
+  private: 
+  template <class mobj, class robj>
+  static void baryon_site(const mobj &D1,
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const int parity,
+				 const int * wick_contractions,
+  				 robj &result);
+  public:
+  static void ContractBaryons(const PropagatorField &q1_left,
+				 const PropagatorField &q2_left,
+				 const PropagatorField &q3_left,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const char * quarks_left,
+				 const char * quarks_right,
+				 const int parity,
+				 ComplexField &baryon_corr);
+  template <class mobj, class robj>
+  static void ContractBaryons_Sliced(const mobj &D1,
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const char * quarks_left,
+				 const char * quarks_right,
+				 const int parity,
+				 robj &result);
+  private: 
+  template <class mobj, class mobj2, class robj>
+  static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
+						 const mobj2 &Du_spec,
+						 const mobj &Dd_tf,
+						 const mobj &Ds_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 robj &result);
+  template <class mobj, class mobj2, class robj>
+  static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
+						 const mobj &Du_tf,
+						 const mobj2 &Du_spec,
+						 const mobj &Dd_tf,
+						 const mobj &Ds_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 robj &result);
+
+
+  template <class mobj, class mobj2, class robj>
+  static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
+						 const mobj2 &Du_spec,
+						 const mobj &Dd_tf,
+						 const mobj &Ds_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 robj &result);
+  template <class mobj, class mobj2, class robj>
+  static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,
+						 const mobj &Du_tf,
+						 const mobj2 &Du_spec,
+						 const mobj &Dd_tf,
+						 const mobj &Ds_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 robj &result);
+  public:
+  template <class mobj>
+  static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
+				 const mobj &Du_spec,
+				 const PropagatorField &qd_tf,
+				 const PropagatorField &qs_ti,
+				 const Gamma Gamma_H,
+				 const Gamma GammaB_sigma,
+				 const Gamma GammaB_nucl,
+		                 const std::string op,
+				 SpinMatrixField &stn_corr);
+  template <class mobj>
+  static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
+				 const PropagatorField &qq_tf,
+				 const mobj &Du_spec,
+				 const PropagatorField &qd_tf,
+				 const PropagatorField &qs_ti,
+				 const Gamma Gamma_H,
+				 const Gamma GammaB_sigma,
+				 const Gamma GammaB_nucl,
+		                 const std::string op,
+				 SpinMatrixField &stn_corr);
+};
+
+template <class FImpl> 
+const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
+template <class FImpl> 
+const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
+						    Complex(1),
+						    Complex(1),
+						    Complex(-1),
+						    Complex(-1),
+						    Complex(-1)};
+
+template <class FImpl>
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 const int * wick_contraction,
+						 robj &result)
+{
+
+  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
+
+    auto gD1a = GammaA_left * GammaA_right * D1;
+    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
+    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
+    auto gD3 = GammaB_right * D3;
+
+    for (int ie_left=0; ie_left < 6 ; ie_left++){
+      int a_left = epsilon[ie_left][0]; //a
+      int b_left = epsilon[ie_left][1]; //b
+      int c_left = epsilon[ie_left][2]; //c
+      for (int ie_right=0; ie_right < 6 ; ie_right++){
+        int a_right = epsilon[ie_right][0]; //a'
+        int b_right = epsilon[ie_right][1]; //b'
+        int c_right = epsilon[ie_right][2]; //c'
+        //This is the \delta_{456}^{123} part
+	if (wick_contraction[0]){
+          auto D2g = D2 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+          }}}
+  	}	  
+        //This is the \delta_{456}^{231} part
+	if (wick_contraction[1]){
+          auto pD1g = pD1 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+          }}}
+        }	  
+        //This is the \delta_{456}^{312} part
+	if (wick_contraction[2]){
+          auto gD3g = gD3 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+          }}}
+        }	  
+        //This is the \delta_{456}^{132} part
+	if (wick_contraction[3]){
+          auto gD3g = gD3 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+          }}}
+        }	  
+        //This is the \delta_{456}^{321} part
+	if (wick_contraction[4]){
+          auto D2g = D2 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+          }}}
+        }	  
+        //This is the \delta_{456}^{213} part
+	if (wick_contraction[5]){
+          auto pD1g = pD1 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+          }}}
+        }	  
+      }
+    }
+}
+
+template<class FImpl>
+void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
+						 const PropagatorField &q2_left,
+						 const PropagatorField &q3_left,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const char * quarks_left,
+						 const char * quarks_right,
+						 const int parity,
+						 ComplexField &baryon_corr)
+{
+  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
+    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
+    std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
+    std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
+  // parity is forwarded to baryon_site; only +1 and -1 are accepted
+  assert((parity == 1 || parity == -1) && "Parity must be +1 or -1");
+
+  GridBase *grid = q1_left.Grid();
+  // wick_contraction[ie] is 1 when quarks_left equals quarks_right permuted by epsilon[ie], else 0
+  int wick_contraction[6];
+  for (int ie=0; ie < 6 ; ie++)
+    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
+
+  auto vbaryon_corr= baryon_corr.View();
+  auto v1 = q1_left.View();
+  auto v2 = q2_left.View();
+  auto v3 = q3_left.View();
+
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+  thread_for(ss,grid->oSites(),{
+  //for(int ss=0; ss < grid->oSites(); ss++){
+
+    auto D1 = v1[ss];
+    auto D2 = v2[ss];
+    auto D3 = v3[ss];
+
+    vobj result=Zero();
+    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+    vbaryon_corr[ss] = result; 
+  }  );//end loop over lattice sites
+}
+template <class FImpl>
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const char * quarks_left,
+						 const char * quarks_right,
+						 const int parity,
+						 robj &result)
+{
+  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
+    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
+    std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
+    std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
+  // parity is forwarded to baryon_site; only +1 and -1 are accepted
+  assert((parity == 1 || parity == -1) && "Parity must be +1 or -1");
+
+  int wick_contraction[6];  // 1 when quarks_left equals quarks_right permuted by epsilon[ie], else 0
+  for (int ie=0; ie < 6 ; ie++)
+    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
+
+     result=Zero();  // result is overwritten, not accumulated into
+     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+}
+
+/***********************************************************************
+ * End of Baryon 2pt-function code.                                    *
+ *                                                                     *
+ * The following code is for Sigma -> N rare hyperon decays            *
+ **********************************************************************/
+
+/* Dq_loop is a quark line from t_H to t_H
+ * Du_spec is a quark line from t_i to t_f
+ * Dd_tf is a quark line from t_f to t_H
+ * Ds_ti is a quark line from t_i to t_H */
+template <class FImpl>
+template <class mobj, class mobj2, class robj>
+void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
+						 const mobj2 &Du_spec,
+						 const mobj &Dd_tf,
+						 const mobj &Ds_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 robj &result)
+{
+
+  Gamma g5(Gamma::Algebra::Gamma5); 
+
+  auto DuG = Du_spec * GammaB_nucl;
+  // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5)
+  auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5;
+  // Dq_loop * \gamma_\mu^L
+  auto DqG = Dq_loop * Gamma_H;
+
+  for (int ie_n=0; ie_n < 6 ; ie_n++){
+    int a_n = epsilon[ie_n][0]; //a
+    int b_n = epsilon[ie_n][1]; //b
+    int c_n = epsilon[ie_n][2]; //c
+    for (int ie_s=0; ie_s < 6 ; ie_s++){
+      int a_s = epsilon[ie_s][0]; //a'
+      int b_s = epsilon[ie_s][1]; //b'
+      int c_s = epsilon[ie_s][2]; //c'
+      for (int alpha_s=0; alpha_s<Ns; alpha_s++){
+      for (int beta_n=0; beta_n<Ns; beta_n++){
+        auto GDsGDd_ab_bb = GDsGDd()(alpha_s,beta_n)(b_s,b_n);
+        for (int tau2=0; tau2<Ns; tau2++){
+        for (int j=0; j<Nc; j++){
+          auto DqG_tt_jj = DqG()(tau2,tau2)(j,j);
+          auto ee_GDGDDG = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsGDd_ab_bb * DqG_tt_jj;
+          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
+          for (int gamma_n=0; gamma_n<Ns; gamma_n++){
+            result()(gamma_s,gamma_n)() += ee_GDGDDG * DuG()(alpha_s, beta_n)(a_s,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
+            result()(gamma_s,gamma_n)() -= ee_GDGDDG * DuG()(gamma_s, beta_n)(c_s,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
+          }}
+	}}
+      }}
+    }
+  }
+}
+
+/* Du_ti is a quark line from t_i to t_H
+ * Du_tf is a quark line from t_f to t_H
+ * Du_spec is a quark line from t_i to t_f
+ * Dd_tf is a quark line from t_f to t_H
+ * Ds_ti is a quark line from t_i to t_H */
+template <class FImpl>
+template <class mobj, class mobj2, class robj>
+void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti,
+						 const mobj &Du_tf,
+						 const mobj2 &Du_spec,
+						 const mobj &Dd_tf,
+						 const mobj &Ds_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 robj &result)
+{
+
+  Gamma g5(Gamma::Algebra::Gamma5); 
+
+  auto DuG = Du_spec * GammaB_nucl;
+  auto adjDu = g5 * adj(Du_tf) * g5;
+  auto adjDuG = adjDu * GammaB_nucl;
+  // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5)
+  auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5;
+  // Du_ti * \gamma_\mu^L
+  auto DuGH = Du_ti * Gamma_H;
+
+  for (int ie_n=0; ie_n < 6 ; ie_n++){
+    int a_n = epsilon[ie_n][0]; //a
+    int b_n = epsilon[ie_n][1]; //b
+    int c_n = epsilon[ie_n][2]; //c
+    for (int ie_s=0; ie_s < 6 ; ie_s++){
+      int a_s = epsilon[ie_s][0]; //a'
+      int b_s = epsilon[ie_s][1]; //b'
+      int c_s = epsilon[ie_s][2]; //c'
+      for (int alpha_s=0; alpha_s<Ns; alpha_s++){
+      for (int beta_n=0; beta_n<Ns; beta_n++){
+        auto GDsGDd_ab_bb = GDsGDd()(alpha_s,beta_n)(b_s,b_n);
+        for (int tau2=0; tau2<Ns; tau2++){
+        for (int j=0; j<Nc; j++){
+          auto DuGH_at_aj = DuGH()(alpha_s,tau2)(a_s,j);
+          auto ee_GDGDDG_a = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsGDd_ab_bb * DuGH_at_aj;
+          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
+            auto DuGH_gt_cj = DuGH()(gamma_s,tau2)(c_s,j);
+            auto ee_GDGDDG_c = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsGDd_ab_bb * DuGH_gt_cj;
+            for (int gamma_n=0; gamma_n<Ns; gamma_n++){
+              result()(gamma_s,gamma_n)() += ee_GDGDDG_a * DuG()(gamma_s, beta_n)(c_s,a_n) * adjDu()(tau2,gamma_n)(j,c_n);
+              result()(gamma_s,gamma_n)() += ee_GDGDDG_c * adjDuG()(tau2, beta_n)(j,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
+              result()(gamma_s,gamma_n)() -= ee_GDGDDG_a * adjDuG()(tau2, beta_n)(j,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
+              result()(gamma_s,gamma_n)() -= ee_GDGDDG_c * DuG()(alpha_s, beta_n)(a_s,a_n) * adjDu()(tau2,gamma_n)(j,c_n);
+            }
+	  }
+	}}
+      }}
+    }
+  }
+}
+
+// Q2 "eye" contraction at a single site; equivalent to the "One-trace" diagram. Adds its terms into result.
+/* Dq_loop is a quark line from t_H to t_H
+ * Du_spec is a quark line from t_i to t_f
+ * Dd_tf is a quark line from t_f to t_H
+ * Ds_ti is a quark line from t_i to t_H   (t_i/t_f/t_H presumably source/sink/operator times -- TODO confirm) */
+template <class FImpl>
+template <class mobj, class mobj2, class robj>
+void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop,
+						 const mobj2 &Du_spec,
+						 const mobj &Dd_tf,
+						 const mobj &Ds_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 robj &result) // updated with += / -= only; it is not reset in this function
+{
+  // Precompute the three propagator/gamma products that every term below reads from.
+  Gamma g5(Gamma::Algebra::Gamma5); 
+  // Du_spec * Gamma^B (nucleon)
+  auto DuG = Du_spec * GammaB_nucl;
+  // Gamma^B (sigma) * Ds * \gamma_\mu^L   (Gamma_H is the matrix written as \gamma_\mu^L here)
+  auto GDsG = GammaB_sigma * Ds_ti * Gamma_H;
+  // Dq_loop * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5)
+  auto DqGDd = Dq_loop * Gamma_H * g5 * adj(Dd_tf) * g5;
+  // Double sum over the 6 rows of the epsilon table: ie_n rows give (a,b,c), ie_s rows give (a',b',c').
+  for (int ie_n=0; ie_n < 6 ; ie_n++){
+    int a_n = epsilon[ie_n][0]; //a
+    int b_n = epsilon[ie_n][1]; //b
+    int c_n = epsilon[ie_n][2]; //c
+    for (int ie_s=0; ie_s < 6 ; ie_s++){
+      int a_s = epsilon[ie_s][0]; //a'
+      int b_s = epsilon[ie_s][1]; //b'
+      int c_s = epsilon[ie_s][2]; //c'
+      for (int alpha_s=0; alpha_s<Ns; alpha_s++){ // Ns-ranged indices address the first index pair, Nc-ranged ones the second
+      for (int tau=0; tau<Ns; tau++){
+      for (int i=0; i<Nc; i++){
+	auto GDsG_at_bi = GDsG()(alpha_s,tau)(b_s,i); // hoisted: independent of beta_n, gamma_s, gamma_n
+        for (int beta_n=0; beta_n<Ns; beta_n++){
+          auto DqGDd_tb_ib = DqGDd()(tau,beta_n)(i,b_n);
+	  auto ee_GDGDGD = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DqGDd_tb_ib; // signs of both epsilon rows times the two elements above
+          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
+          for (int gamma_n=0; gamma_n<Ns; gamma_n++){ // the two terms differ by swapping (alpha_s,a_s) <-> (gamma_s,c_s) between DuG and Du_spec
+            result()(gamma_s,gamma_n)() -= ee_GDGDGD * DuG()(alpha_s, beta_n)(a_s,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
+            result()(gamma_s,gamma_n)() += ee_GDGDGD * DuG()(gamma_s, beta_n)(c_s,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
+          }}
+	}
+      }}}
+    }
+  }
+}
+
+/* Q2 "non-eye" contraction at a single site (adds its terms into result). Du_ti is a quark line from t_i to t_H
+ * Du_tf is a quark line from t_f to t_H
+ * Du_spec is a quark line from t_i to t_f
+ * Dd_tf is a quark line from t_f to t_H
+ * Ds_ti is a quark line from t_i to t_H */
+template <class FImpl>
+template <class mobj, class mobj2, class robj>
+void BaryonUtils<FImpl>::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti,
+						 const mobj &Du_tf,
+						 const mobj2 &Du_spec,
+						 const mobj &Dd_tf,
+						 const mobj &Ds_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 robj &result) // updated with += / -= only; it is not reset in this function
+{
+  // Precompute the propagator/gamma products that the four terms below read from.
+  Gamma g5(Gamma::Algebra::Gamma5); 
+  // Du_spec * Gamma^B (nucleon)
+  auto DuG = Du_spec * GammaB_nucl;
+  auto adjDu = g5 * adj(Du_tf) * g5; // \gamma_5 * Du_tf^\dagger * \gamma_5
+  auto adjDuG = adjDu * GammaB_nucl; // the line above times Gamma^B (nucleon)
+  // Gamma^B (sigma) * Ds * \gamma_\mu^L   (Gamma_H is the matrix written as \gamma_\mu^L here)
+  auto GDsG = GammaB_sigma * Ds_ti * Gamma_H;
+  // Du * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5)
+  auto DuGDd = Du_ti * Gamma_H * g5 * adj(Dd_tf) * g5;
+  // Double sum over the 6 rows of the epsilon table: ie_n rows give (a,b,c), ie_s rows give (a',b',c').
+  for (int ie_n=0; ie_n < 6 ; ie_n++){
+    int a_n = epsilon[ie_n][0]; //a
+    int b_n = epsilon[ie_n][1]; //b
+    int c_n = epsilon[ie_n][2]; //c
+    for (int ie_s=0; ie_s < 6 ; ie_s++){
+      int a_s = epsilon[ie_s][0]; //a'
+      int b_s = epsilon[ie_s][1]; //b'
+      int c_s = epsilon[ie_s][2]; //c'
+      for (int alpha_s=0; alpha_s<Ns; alpha_s++){ // Ns-ranged indices address the first index pair, Nc-ranged ones the second
+      for (int tau=0; tau<Ns; tau++){
+      for (int i=0; i<Nc; i++){
+	auto GDsG_at_bi = GDsG()(alpha_s,tau)(b_s,i); // hoisted: independent of beta_n, gamma_s, gamma_n
+        for (int beta_n=0; beta_n<Ns; beta_n++){
+          auto DuGDd_ab_ab = DuGDd()(alpha_s,beta_n)(a_s,b_n);
+	  auto ee_GDGDGD_a = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DuGDd_ab_ab; // prefactor using the (alpha_s,a_s) element of DuGDd
+          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
+            auto DuGDd_gb_cb = DuGDd()(gamma_s,beta_n)(c_s,b_n);
+	    auto ee_GDGDGD_c = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DuGDd_gb_cb; // prefactor using the (gamma_s,c_s) element of DuGDd
+            for (int gamma_n=0; gamma_n<Ns; gamma_n++){ // four terms: each prefactor (_a, _c) appears once with '-' and once with '+'
+              result()(gamma_s,gamma_n)() -= ee_GDGDGD_a * DuG()(gamma_s, beta_n)(c_s,a_n) * adjDu()(tau,gamma_n)(i,c_n);
+              result()(gamma_s,gamma_n)() -= ee_GDGDGD_c * adjDuG()(tau, beta_n)(i,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
+              result()(gamma_s,gamma_n)() += ee_GDGDGD_a * adjDuG()(tau, beta_n)(i,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
+              result()(gamma_s,gamma_n)() += ee_GDGDGD_c * DuG()(alpha_s, beta_n)(a_s,a_n) * adjDu()(tau,gamma_n)(i,c_n);
+            }
+	  }
+	}
+      }}}
+    }
+  }
+}
+
+template<class FImpl> // Lattice-wide "eye" driver: runs the per-site Q1 or Q2 eye kernel on every outer site of the grid.
+template <class mobj>
+void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
+						 const mobj &Du_spec,
+						 const PropagatorField &qd_tf,
+						 const PropagatorField &qs_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 const std::string op, // "Q1" or "Q2"; any other value reaches assert(0) below
+						 SpinMatrixField &stn_corr) // output: assigned at every site visited by the loop
+{
+  GridBase *grid = qs_ti.Grid(); // the loop bound oSites() is taken from qs_ti's grid
+  // Open views so that individual sites can be read/written with [ss].
+  auto vcorr= stn_corr.View();
+  auto vq_loop = qq_loop.View();
+  auto vd_tf = qd_tf.View();
+  auto vs_ti = qs_ti.View();
+  // Du_spec is not indexed by ss: the same object is handed to the kernel at every site.
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {   (accelerator variant left disabled; thread_for is used instead)
+  thread_for(ss,grid->oSites(),{
+    auto Dq_loop = vq_loop[ss];
+    auto Dd_tf = vd_tf[ss];
+    auto Ds_ti = vs_ti[ss];
+    sobj result=Zero(); // fresh accumulator per site; the *_site kernels add into it
+    if(op == "Q1"){
+      Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+    } else if(op == "Q2"){
+      Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+    } else {
+      assert(0 && "Weak Operator not correctly specified"); // with NDEBUG this is a no-op and result stays zero
+    }
+      vcorr[ss] = result; 
+  }  );//end loop over lattice sites
+}
+
+template<class FImpl> // Lattice-wide "non-eye" driver: runs the per-site Q1 or Q2 non-eye kernel on every outer site of the grid.
+template <class mobj>
+void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
+						 const PropagatorField &qq_tf,
+						 const mobj &Du_spec,
+						 const PropagatorField &qd_tf,
+						 const PropagatorField &qs_ti,
+				                 const Gamma Gamma_H,
+				                 const Gamma GammaB_sigma,
+		                 		 const Gamma GammaB_nucl,
+						 const std::string op, // "Q1" or "Q2"; any other value reaches assert(0) below
+						 SpinMatrixField &stn_corr) // output: assigned at every site visited by the loop
+{
+  GridBase *grid = qs_ti.Grid(); // the loop bound oSites() is taken from qs_ti's grid
+  // Open views so that individual sites can be read/written with [ss].
+  auto vcorr= stn_corr.View();
+  auto vq_ti = qq_ti.View();
+  auto vq_tf = qq_tf.View();
+  auto vd_tf = qd_tf.View();
+  auto vs_ti = qs_ti.View();
+  // Du_spec is not indexed by ss: the same object is handed to the kernel at every site.
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {   (accelerator variant left disabled; thread_for is used instead)
+  thread_for(ss,grid->oSites(),{
+    auto Dq_ti = vq_ti[ss];
+    auto Dq_tf = vq_tf[ss];
+    auto Dd_tf = vd_tf[ss];
+    auto Ds_ti = vs_ti[ss];
+    sobj result=Zero(); // fresh accumulator per site; the *_site kernels add into it
+    if(op == "Q1"){
+      Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+    } else if(op == "Q2"){
+      Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+    } else {
+      assert(0 && "Weak Operator not correctly specified"); // with NDEBUG this is a no-op and result stays zero
+    }
+      vcorr[ss] = result; 
+  }  );//end loop over lattice sites
+}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@@ -92,6 +92,7 @@ public:
  };

  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
+  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}

  void ImportGauge(const GaugeField& _U) {
--- a/Grid/qcd/utils/CovariantSmearing.h
+++ b/Grid/qcd/utils/CovariantSmearing.h
@@ -27,8 +27,7 @@ directory
 *************************************************************************************/
 #pragma once

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <class Gimpl> class CovariantSmearing : public Gimpl 
 {
@@ -84,4 +83,5 @@ public:
    }
  }
 };
-}}
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@@ -201,7 +201,6 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
  });
 }

-// I explicitly need these outside the QCD namespace
 template<typename vobj>
 void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
 {
--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -403,6 +403,10 @@ namespace Optimization {
    accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
      return a/b;
    }
+    accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){ // GpuVectorI overload, same form as the GpuVectorRD one above
+      return a/b; // relies on operator/ for GpuVectorI, defined outside this hunk -- presumably element-wise, TODO confirm
+    }
+
    // Danger -- element wise divide fro complex, not complex div. 
    // See Grid_vector_types.h lines around 735, applied after "toReal"
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -1233,7 +1233,7 @@ public:
  };
  
  void Report(void) {
-#define AVERAGE(A) _grid->GlobalSum(A);A/=NP;
+#define AVERAGE(A) 
 #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
    RealD NP = _grid->_Nprocessors;
    RealD NN = _grid->NodeCount();
@@ -1281,11 +1281,13 @@ public:
 	std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
 	std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
      }
+      /*
      PRINTIT(mpi3synctime);
      PRINTIT(mpi3synctime_g);
      PRINTIT(shmmergetime);
      PRINTIT(splicetime);
      PRINTIT(nosplicetime);
+      */
    }
 #undef PRINTIT
 #undef AVERAGE
--- a/Grid/tensors/Tensor_arith_scalar.h
+++ b/Grid/tensors/Tensor_arith_scalar.h
@@ -60,166 +60,243 @@ template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const typena
 ////////////////////////////////////////////////////////////////////
 // Double support; cast to "scalar_type" through constructor
 ////////////////////////////////////////////////////////////////////
-template<class l> accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,double rhs) 
+
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,double> = 0> 
+accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,double rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,double>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t; t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs*srhs;
 }
-template<class l> accelerator_inline iScalar<l> operator * (double lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iScalar<l> operator * (double lhs,const iScalar<l>& rhs) 
+{ 
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,double>::value,int>::type i=0;
+  return rhs*lhs; 
+}

-template<class l,int N> accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,double rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,double rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,double>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs*srhs;
 }
-template<class l,int N> accelerator_inline iVector<l,N> operator * (double lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iVector<l,N> operator * (double lhs,const iVector<l,N>& rhs) 
+{ 
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,double>::value,int>::type i=0;
+  return rhs*lhs; 
+}

-template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,double rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,double rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,double>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs*srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator * (double lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iMatrix<l,N> operator * (double lhs,const iMatrix<l,N>& rhs) 
+{  
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,double>::value,int>::type i=0;
+  return rhs*lhs; 
+}

 ////////////////////////////////////////////////////////////////////
 // Complex support; cast to "scalar_type" through constructor
 ////////////////////////////////////////////////////////////////////
-template<class l> accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,ComplexD rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,ComplexD> = 0>  
+accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,ComplexD rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,ComplexD>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  
-  
  return lhs*srhs;
 }
-template<class l> accelerator_inline iScalar<l> operator * (ComplexD lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
-
-template<class l,int N> accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,ComplexD rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,ComplexD> = 0>  
+accelerator_inline iScalar<l> operator * (ComplexD lhs,const iScalar<l>& rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,ComplexD>::value,int>::type i=0;
+  return rhs*lhs; 
+}
+
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,ComplexD> = 0>  
+accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,ComplexD rhs) 
+{
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,ComplexD>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs*srhs;
 }
-template<class l,int N> accelerator_inline iVector<l,N> operator * (ComplexD lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
-
-template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,ComplexD rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,ComplexD> = 0>  
+accelerator_inline iVector<l,N> operator * (ComplexD lhs,const iVector<l,N>& rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,ComplexD>::value,int>::type i=0;
+  return rhs*lhs; 
+}
+
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,ComplexD> = 0>  
+accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,ComplexD rhs) 
+{
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,ComplexD>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs*srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator * (ComplexD lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,ComplexD> = 0>  
+accelerator_inline iMatrix<l,N> operator * (ComplexD lhs,const iMatrix<l,N>& rhs) 
+{ 
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,ComplexD>::value,int>::type i=0;
+  return rhs*lhs; 
+}

 ////////////////////////////////////////////////////////////////////
 // Integer support; cast to "scalar_type" through constructor
 ////////////////////////////////////////////////////////////////////
-template<class l> accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,Integer rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,Integer rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,Integer>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t;  t=rhs;
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
  return lhs*srhs;
 }
-template<class l> accelerator_inline iScalar<l> operator * (Integer lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iScalar<l> operator * (Integer lhs,const iScalar<l>& rhs) 
+{  
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,Integer>::value,int>::type i=0;
+  return rhs*lhs; 
+}

-template<class l,int N> accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,Integer rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,Integer rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,Integer>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs*srhs;
 }
-template<class l,int N> accelerator_inline iVector<l,N> operator * (Integer lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
-
-template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,Integer rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iVector<l,N> operator * (Integer lhs,const iVector<l,N>& rhs) 
 {
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,Integer>::value,int>::type i=0;
+  return rhs*lhs; 
+}
+
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,Integer rhs) 
+{
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,Integer>::value,int>::type i=0;
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs*srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator * (Integer lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iMatrix<l,N> operator * (Integer lhs,const iMatrix<l,N>& rhs) 
+{
+  //  typename std::enable_if<!std::is_same<typename iScalar<l>::scalar_type,Integer>::value,int>::type i=0;
+  return rhs*lhs; 
+}



 ///////////////////////////////////////////////////////////////////////////////////////////////
 // addition by fundamental scalar type applies to matrix(down diag) and scalar
 ///////////////////////////////////////////////////////////////////////////////////////////////
-template<class l> accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l>  
+accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs+srhs;
 }
-template<class l> accelerator_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
+template<class l>  
+accelerator_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }

-template<class l,int N> accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l,int N>  
+accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iMatrix<l,N>::tensor_reduced srhs; srhs=rhs;
  return lhs+srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator + (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }
+template<class l,int N>  
+accelerator_inline iMatrix<l,N> operator + (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }

 ////////////////////////////////////////////////////////////////////
 // Double support; cast to "scalar_type" through constructor
 ////////////////////////////////////////////////////////////////////
-template<class l> accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,double rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,double rhs) 
 {
  typename iScalar<l>::scalar_type t; t=rhs;
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
  return lhs+srhs;
 }
-template<class l> accelerator_inline iScalar<l> operator + (double lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iScalar<l> operator + (double lhs,const iScalar<l>& rhs) {  return rhs+lhs; }

-template<class l,int N> accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,double rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,double rhs) 
 {
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs+srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator + (double lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iMatrix<l,N> operator + (double lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }


 // Integer support cast to scalar type through constructor
-
-
-template<class l> accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,Integer rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  // guard names Integer (this overload's rhs type), as the Integer overloads of operator* and operator- do
+accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,Integer rhs) 
 {
   typename iScalar<l>::scalar_type t; t=rhs;
   typename iScalar<l>::tensor_reduced srhs; srhs=t;
   return lhs+srhs;
 }
 
-template<class l> accelerator_inline iScalar<l> operator + (Integer lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  // removed when scalar_type is Integer: the scalar_type overload already takes that argument
+accelerator_inline iScalar<l> operator + (Integer lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
 
-template<class l,int N> accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,Integer rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  // same Integer guard for the matrix form
+accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,Integer rhs) 
 {
   typename iScalar<l>::scalar_type t;t=rhs;
   typename iScalar<l>::tensor_reduced srhs;srhs=t;
   return lhs+srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator + (Integer lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  // same Integer guard; forwards to the (matrix, Integer) overload
+accelerator_inline iMatrix<l,N> operator + (Integer lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; }


 ///////////////////////////////////////////////////////////////////////////////////////////////
 // subtraction of fundamental scalar type applies to matrix(down diag) and scalar
 ///////////////////////////////////////////////////////////////////////////////////////////////
-template<class l> accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l>  
+accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs-srhs;
 }
-template<class l> accelerator_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
+template<class l>  
+accelerator_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
 {
  typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
  return slhs-rhs;
 }

-template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l,int N>  
+accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs-srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) 
+template<class l,int N>  
+accelerator_inline iMatrix<l,N> operator - (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) 
 {
  typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
  return slhs-rhs;
@@ -228,26 +305,30 @@ template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const typena
 ////////////////////////////////////////////////////////////////////
 // Double support; cast to "scalar_type" through constructor
 ////////////////////////////////////////////////////////////////////
-template<class l> accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,double rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,double rhs) 
 {
  typename iScalar<l>::scalar_type t; t=rhs;
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
  return lhs-srhs;
 }
-template<class l> accelerator_inline iScalar<l> operator - (double lhs,const iScalar<l>& rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iScalar<l> operator - (double lhs,const iScalar<l>& rhs) 
 {
  typename iScalar<l>::scalar_type t(lhs);
  typename iScalar<l>::tensor_reduced slhs;slhs=t;
  return slhs-rhs;
 }

-template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,double rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,double rhs) 
 {
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs-srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator - (double lhs,const iMatrix<l,N>& rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,double> = 0>  
+accelerator_inline iMatrix<l,N> operator - (double lhs,const iMatrix<l,N>& rhs) 
 {
  typename iScalar<l>::scalar_type t(lhs);
  typename iScalar<l>::tensor_reduced slhs;slhs=t;
@@ -257,25 +338,29 @@ template<class l,int N> accelerator_inline iMatrix<l,N> operator - (double lhs,c
 ////////////////////////////////////////////////////////////////////
 // Integer support; cast to "scalar_type" through constructor
 ////////////////////////////////////////////////////////////////////
-template<class l> accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,Integer rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,Integer rhs) 
 {
  typename iScalar<l>::scalar_type t; t=rhs;
  typename iScalar<l>::tensor_reduced srhs; srhs=t;
  return lhs-srhs;
 }
-template<class l> accelerator_inline iScalar<l> operator - (Integer lhs,const iScalar<l>& rhs) 
+template<class l,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iScalar<l> operator - (Integer lhs,const iScalar<l>& rhs) 
 {
  typename iScalar<l>::scalar_type t;t=lhs;
  typename iScalar<l>::tensor_reduced slhs;slhs=t;
  return slhs-rhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,Integer rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,Integer rhs) 
 {
  typename iScalar<l>::scalar_type t;t=rhs;
  typename iScalar<l>::tensor_reduced srhs;srhs=t;
  return lhs-srhs;
 }
-template<class l,int N> accelerator_inline iMatrix<l,N> operator - (Integer lhs,const iMatrix<l,N>& rhs) 
+template<class l,int N,IfNotSame<typename iScalar<l>::scalar_type,Integer> = 0>  
+accelerator_inline iMatrix<l,N> operator - (Integer lhs,const iMatrix<l,N>& rhs) 
 {
  typename iScalar<l>::scalar_type t;t=lhs;
  typename iScalar<l>::tensor_reduced slhs;slhs=t;
--- a/Grid/threads/Pragmas.h
+++ b/Grid/threads/Pragmas.h
@@ -57,14 +57,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_max(a) (1)
 #endif

-#define naked_for(i,num,...) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
-#define naked_foreach(i,container,...) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
-#define thread_for( i, num, ... )                           DO_PRAGMA(omp parallel for schedule(static)) naked_for(i,num,{__VA_ARGS__});
-#define thread_foreach( i, num, ... )                       DO_PRAGMA(omp parallel for schedule(static)) naked_foreach(i,num,{__VA_ARGS__});
-#define thread_for_in_region( i, num, ... )                 DO_PRAGMA(omp for schedule(static))          naked_for(i,num,{__VA_ARGS__});
-#define thread_for_collapse2( i, num, ... )                 DO_PRAGMA(omp parallel for collapse(2))      naked_for(i,num,{__VA_ARGS__});
-#define thread_for_collapse( N , i, num, ... )              DO_PRAGMA(omp parallel for collapse ( N ) )  naked_for(i,num,{__VA_ARGS__});
-#define thread_for_collapse_in_region( N , i, num, ... )    DO_PRAGMA(omp for collapse ( N ))            naked_for(i,num,{__VA_ARGS__});
+#define thread_for( i, num, ... )                           DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_foreach( i, container, ... )                 DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
+#define thread_for_in_region( i, num, ... )                 DO_PRAGMA(omp for schedule(static))          for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_for_collapse2( i, num, ... )                 DO_PRAGMA(omp parallel for collapse(2))      for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_for_collapse( N , i, num, ... )              DO_PRAGMA(omp parallel for collapse ( N ) )  for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_for_collapse_in_region( N , i, num, ... )    DO_PRAGMA(omp for collapse ( N ))            for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 #define thread_region                                       DO_PRAGMA(omp parallel)
 #define thread_critical                                     DO_PRAGMA(omp critical)

--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -292,7 +292,7 @@ void GridGpuInit(void)
  gpu_props = new cudaDeviceProp[nDevices];

  char * localRankStr = NULL;
-  int rank = 0, device = 0, world_rank=0; 
+  int rank = 0, world_rank=0; 
 #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
@@ -301,23 +301,16 @@ void GridGpuInit(void)
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
  {
    rank = atoi(localRankStr);		
-    device = rank %nDevices;
  }
  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
  {
    rank = atoi(localRankStr);		
-    device = rank %nDevices;
  }
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}

-  cudaSetDevice(device);
  if ( world_rank == 0 ) {
    GridBanner();
-    printf("GpuInit: ================================================\n");
-    printf("GpuInit: Setting up Cuda Device map before first MPI call\n",nDevices);
-    printf("GpuInit: ================================================\n");
-    printf("GpuInit: Cuda reports %d GPUs on MPI rank 0\n",nDevices);
  }

  for (int i = 0; i < nDevices; i++) {
@@ -325,7 +318,6 @@ void GridGpuInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("GpuInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
    
-    //      cudaGetDeviceProperties(&prop, i);
    cudaGetDeviceProperties(&gpu_props[i], i);
    if ( world_rank == 0) {
      cudaDeviceProp prop; 
@@ -334,15 +326,13 @@ void GridGpuInit(void)
      printf("GpuInit: Device Number    : %d\n", i);
      printf("GpuInit: ========================\n");
      printf("GpuInit: Device identifier: %s\n", prop.name);
-      //      printf("GpuInit:   Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
+
      GPU_PROP(managedMemory);
      GPU_PROP(isMultiGpuBoard);
      GPU_PROP(warpSize);
-#if 0
-      GPU_PROP(unifiedAddressing);
-      GPU_PROP(l2CacheSize);
-      GPU_PROP(singleToDoublePrecisionPerfRatio);
-#endif
+      //      GPU_PROP(unifiedAddressing);
+      //      GPU_PROP(l2CacheSize);
+      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
  if ( world_rank == 0 ) {
@@ -638,6 +628,7 @@ void Grid_debug_handler_init(void)
  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
  sigaction(SIGBUS,&sa,NULL);
+  sigaction(SIGUSR2,&sa,NULL);

  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);

--- a/HMC/Mobius2p1f.cc
+++ b/HMC/Mobius2p1f.cc
@@ -31,7 +31,6 @@ directory

 int main(int argc, char **argv) {
  using namespace Grid;
-  using namespace Grid::QCD;

  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
@@ -44,18 +43,18 @@ int main(int argc, char **argv) {
  typedef typename FermionAction::FermionField FermionField;

  typedef Grid::XmlReader       Serialiser;
-  
+
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
-  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
-  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 20;
  MD.trajL   = 1.0;
-  
+
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
@@ -67,7 +66,7 @@ int main(int argc, char **argv) {

  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-  
+
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
@@ -81,7 +80,7 @@ int main(int argc, char **argv) {
  TheHMC.Resources.SetRNGSeeds(RNGpar);

  // Construct observables
-  // here there is too much indirection 
+  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
@@ -118,7 +117,7 @@ int main(int argc, char **argv) {
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
-  
+
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
--- a/HMC/Mobius2p1fEOFA.cc
+++ b/HMC/Mobius2p1fEOFA.cc
@@ -34,8 +34,7 @@ directory
 #define MIXED_PRECISION
 #endif

-namespace Grid{ 
-  namespace QCD{
+NAMESPACE_BEGIN(Grid);

  /*
   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
@@ -163,11 +162,11 @@ namespace Grid{
      MPCG(src,psi);
    }
  };
-}};
+
+NAMESPACE_END(Grid);

 int main(int argc, char **argv) {
  using namespace Grid;
-  using namespace Grid::QCD;

  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
--- a/HMC/Mobius2p1fEOFA_F1.cc
+++ b/HMC/Mobius2p1fEOFA_F1.cc
@@ -34,8 +34,7 @@ directory
 #define MIXED_PRECISION
 #endif

-namespace Grid{ 
-  namespace QCD{
+NAMESPACE_BEGIN(Grid);

  /*
   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
@@ -146,11 +145,12 @@ namespace Grid{
      MPCG(src,psi);
    }
  };
-}};
+
+NAMESPACE_END(Grid);
+

 int main(int argc, char **argv) {
  using namespace Grid;
-  using namespace Grid::QCD;

  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
@@ -341,7 +341,7 @@ int main(int argc, char **argv) {
 	 ActionCG, 
 	 ActionCG, ActionCG,
 	 ActionCG, ActionCG,
-	 //	 DerivativeCG, DerivativeCG,
+	 //         DerivativeCG, DerivativeCG,
 	 OFRp, true);
 #endif
  Level1.push_back(&EOFA);
--- a/HMC/Mobius2p1fRHMC.cc
+++ b/HMC/Mobius2p1fRHMC.cc
@@ -31,7 +31,6 @@ directory

 int main(int argc, char **argv) {
  using namespace Grid;
-  using namespace Grid::QCD;

  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
@@ -44,18 +43,18 @@ int main(int argc, char **argv) {
  typedef typename FermionAction::FermionField FermionField;

  typedef Grid::XmlReader       Serialiser;
-  
+
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
  //  MD.name    = std::string("Leap Frog");
-  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
  //  MD.name    = std::string("Force Gradient");
-  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 20;
  MD.trajL   = 1.0;
-  
+
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 30;
  HMCparams.Trajectories     = 200;
@@ -68,7 +67,7 @@ int main(int argc, char **argv) {

  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-  
+
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
@@ -82,7 +81,7 @@ int main(int argc, char **argv) {
  TheHMC.Resources.SetRNGSeeds(RNGpar);

  // Construct observables
-  // here there is too much indirection 
+  // here there is too much indirection
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
@@ -93,11 +92,11 @@ int main(int argc, char **argv) {
  Real strange_mass = 0.04;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
-  RealD b   = 1.0; 
+  RealD b   = 1.0;
  RealD c   = 0.0;
-  
+
  // FIXME:
-  // Same in MC and MD 
+  // Same in MC and MD
  // Need to mix precision too
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 4.0e-3;
@@ -122,7 +121,7 @@ int main(int argc, char **argv) {
  // These lines are unecessary if BC are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
-  
+
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -108,7 +108,7 @@ public:
    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
                   const unsigned int i, const unsigned int j);
    template <template <class> class Vec, typename VecT>
-    void load(Vec<VecT> &v, double *tRead = nullptr);
+    void load(Vec<VecT> &v, double *tRead = nullptr, GridBase *grid = nullptr);
 private:
    std::string  filename_{""}, dataname_{""};
    unsigned int nt_{0}, ni_{0}, nj_{0};
@@ -506,44 +506,53 @@ void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,

 template <typename T>
 template <template <class> class Vec, typename VecT>
-void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
+void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead, GridBase *grid)
 {
 #ifdef HAVE_HDF5
-    Hdf5Reader           reader(filename_);
    std::vector<hsize_t> hdim;
    H5NS::DataSet        dataset;
    H5NS::DataSpace      dataspace;
    H5NS::CompType       datatype;
-    
-    push(reader, dataname_);
-    auto &group = reader.getGroup();
-    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
-    datatype    = dataset.getCompType();
-    dataspace   = dataset.getSpace();
-    hdim.resize(dataspace.getSimpleExtentNdims());
-    dataspace.getSimpleExtentDims(hdim.data());
-    if ((nt_*ni_*nj_ != 0) and
-        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
+
+    if (!(grid) || grid->IsBoss())
    {
-        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
-            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
-            + std::to_string(hdim[2]) + ", expected "
-            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
-            + std::to_string(nj_));
-    }
-    else if (ni_*nj_ == 0)
-    {
-        if (hdim[0] != nt_)
+        Hdf5Reader reader(filename_);
+        push(reader, dataname_);
+        auto &group = reader.getGroup();
+        dataset = group.openDataSet(HADRONS_A2AM_NAME);
+        datatype = dataset.getCompType();
+        dataspace = dataset.getSpace();
+        hdim.resize(dataspace.getSimpleExtentNdims());
+        dataspace.getSimpleExtentDims(hdim.data());
+        if ((nt_ * ni_ * nj_ != 0) and
+            ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
        {
-            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
-                + std::to_string(hdim[0]) + ", expected "
-                + std::to_string(nt_) + ")");
+            HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
+                + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
+                + std::to_string(hdim[2]) + ", expected "
+                + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
+                + std::to_string(nj_));
        }
-        ni_ = hdim[1];
-        nj_ = hdim[2];
+        else if (ni_*nj_ == 0)
+        {
+            if (hdim[0] != nt_)
+            {
+                HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
+                    + std::to_string(hdim[0]) + ", expected "
+                    + std::to_string(nt_) + ")");
+            }
+            ni_ = hdim[1];
+            nj_ = hdim[2];
+        }
+    }
+    if (grid)
+    {
+        grid->Broadcast(grid->BossRank(), &ni_, sizeof(unsigned int));
+        grid->Broadcast(grid->BossRank(), &nj_, sizeof(unsigned int));
    }

    A2AMatrix<T>         buf(ni_, nj_);
+    int broadcastSize =  sizeof(T) * buf.size();
    std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
                                     static_cast<hsize_t>(nj_)},
                         stride   = {1, 1, 1},
@@ -565,10 +574,20 @@ void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
            std::cout << " " << t;
            std::cout.flush();
        }
-        dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
-                                  stride.data(), block.data());
-        if (tRead) *tRead -= usecond();    
-        dataset.read(buf.data(), datatype, memspace, dataspace);
+        if (!(grid) || grid->IsBoss())
+        {
+            dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
+                                      stride.data(), block.data());
+        }
+        if (tRead) *tRead -= usecond();
+        if (!(grid) || grid->IsBoss())
+        {
+            dataset.read(buf.data(), datatype, memspace, dataspace);
+        }
+        if (grid)
+        {
+            grid->Broadcast(grid->BossRank(), buf.data(), broadcastSize);
+        }
        if (tRead) *tRead += usecond();
        v[t] = buf.template cast<VecT>();
    }
--- a/Hadrons/DilutedNoise.hpp
+++ b/Hadrons/DilutedNoise.hpp
@@ -52,6 +52,7 @@ public:
    const std::vector<FermionField> & getNoise(void) const;
    const FermionField &              operator[](const unsigned int i) const;
    FermionField &                    operator[](const unsigned int i);
+    void                              normalise(Real norm);
    void                              resize(const unsigned int nNoise);
    unsigned int                      size(void) const;
    GridCartesian                     *getGrid(void) const;
@@ -93,6 +94,21 @@ private:
    unsigned int nSrc_;
 };

+template <typename FImpl>
+class SparseSpinColorDiagonalNoise: public DilutedNoise<FImpl>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    // constructor/destructor
+    SparseSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src, unsigned int n_sparse);
+    virtual ~SparseSpinColorDiagonalNoise(void) = default;
+    // generate noise
+    virtual void generateNoise(GridParallelRNG &rng);
+private:
+    unsigned int nSrc_;
+    unsigned int nSparse_;
+};

 /******************************************************************************
 *                    DilutedNoise template implementation                    *
@@ -138,6 +154,15 @@ DilutedNoise<FImpl>::operator[](const unsigned int i)
    return noise_[i];
 }

+template <typename FImpl>
+void DilutedNoise<FImpl>::normalise(Real norm)
+{
+    for (auto &eta: noise_) // range-for: no int index compared against the unsigned size()
+    {
+        eta = norm*eta;     // rescale the stored noise vector in place
+    }
+}
+
 template <typename FImpl>
 void DilutedNoise<FImpl>::resize(const unsigned int nNoise)
 {
@@ -245,6 +270,87 @@ void FullVolumeSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng
    }
 }

+/******************************************************************************
+ *        SparseSpinColorDiagonalNoise template implementation           *
+ ******************************************************************************/
+template <typename FImpl>
+SparseSpinColorDiagonalNoise<FImpl>::
+SparseSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc, unsigned int nSparse)
+: DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension), nSrc_(nSrc), nSparse_(nSparse)
+{}
+
+template <typename FImpl>
+void SparseSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
+{
+    typedef decltype(peekColour((*this)[0], 0)) SpinField;
+
+    auto                       &noise = *this;
+    auto                       g      = this->getGrid();
+    auto                       nd     = g->GlobalDimensions().size();
+    auto                       nc     = FImpl::Dimension;
+    LatticeInteger             coor(g), coorTot(g); coorTot = 0.;
+    Complex                    shift(1., 1.);
+    LatticeComplex             eta(g);
+    SpinField                  etas(g);
+    unsigned int               i = 0; // index of the next noise vector to fill
+    unsigned int               j = 0; // offset added to the coordinate sum
+    unsigned int               nSrc_ec;
+
+    // nSrc_ec = number of consecutive hits sharing one offset j (integer
+    // division). The asserts guard the divisions below: nSparse_ == 0 or
+    // 0 < nSrc_ < nSparse_ (nSrc_ec == 0) would otherwise divide by zero.
+    assert(nSparse_ > 0);
+    nSrc_ec = nSrc_/nSparse_;
+    assert((nSrc_ == 0) || (nSrc_ec > 0));
+
+    for (unsigned int n = 0; n < nSrc_; ++n)
+    {
+        // draw eta, then shift and rescale it: (2*eta - (1+i))/sqrt(2)
+        bernoulli(rng, eta);
+        eta = (2.*eta - shift)*(1./::sqrt(2.));
+
+        if(nSparse_ != 1)
+        {
+            assert(g->GlobalDimensions()[1]%nSparse_ == 0);
+            // # 0 # 0
+            // 0 # 0 #
+            // # 0 # 0
+            // 0 # 0 #
+
+            // coorTot = (sum of the site coordinates) + j
+            coorTot = 0;
+            for(unsigned int d = 0; d < nd; ++d)
+            {
+                LatticeCoordinate(coor, d);
+                coorTot = coorTot + coor;
+            }
+            coorTot = coorTot + j;
+            // zero eta wherever coorTot is not a multiple of nSparse_
+            eta = where(mod(coorTot,nSparse_), 0.*eta, eta);
+        }
+
+        // one noise vector per (spin, colour): eta in that component only
+        for (unsigned int s = 0; s < Ns; ++s)
+        {
+            etas = Zero();
+            pokeSpin(etas, eta, s);
+            for (unsigned int c = 0; c < nc; ++c)
+            {
+                noise[i] = Zero();
+                pokeColour(noise[i], etas, c);
+                i++;
+            }
+        }
+        // advance the offset after every nSrc_ec hits
+        if ((n+1)%nSrc_ec == 0)
+        {
+            j++;
+        }
+    }
+    Real norm = sqrt(1./nSrc_ec);
+    this->normalise(norm);
+}
+
 END_HADRONS_NAMESPACE

 #endif // Hadrons_DilutedNoise_hpp_
--- a/Hadrons/DiskVector.hpp
+++ b/Hadrons/DiskVector.hpp
@@ -87,13 +87,20 @@ public:
    };
 public:
    DiskVectorBase(const std::string dirname, const unsigned int size = 0,
-                   const unsigned int cacheSize = 1, const bool clean = true);
+                   const unsigned int cacheSize = 1, const bool clean = true,
+                   GridBase *grid = nullptr);
    DiskVectorBase(DiskVectorBase<T> &&v) = default;
    virtual ~DiskVectorBase(void);
    const T & operator[](const unsigned int i) const;
    RwAccessHelper operator[](const unsigned int i);
    double hitRatio(void) const;
    void resetStat(void);
+    void setSize(unsigned int size_);
+    unsigned int getSize() const;
+    unsigned int dvSize;
+    void setGrid(GridBase *grid_);
+    GridBase *getGrid() const;
+    GridBase *dvGrid;
 private:
    virtual void load(T &obj, const std::string filename) const = 0;
    virtual void save(const std::string filename, const T &obj) const = 0;
@@ -107,6 +114,7 @@ private:
    unsigned int                                          size_, cacheSize_;
    double                                                access_{0.}, hit_{0.};
    bool                                                  clean_;
+    GridBase                                              *grid_;
    // using pointers to allow modifications when class is const
    // semantic: const means data unmodified, but cache modification allowed
    std::unique_ptr<std::vector<T>>                       cachePtr_;
@@ -158,66 +166,92 @@ public:
    {
        return (*this)[i](j, k);
    }
+    std::vector<int> dimensions() const
+    {
+        std::vector<int> dims(3);
+        dims[0] = (*this).getSize();
+        dims[1] = (*this)[0].rows();
+        dims[2] = (*this)[0].cols();
+        return dims;
+    }
 private:
    virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
    {
-        std::ifstream f(filename, std::ios::binary);
-        uint32_t      crc, check;
-        Eigen::Index  nRow, nCol;
-        size_t        matSize;
-        double        tRead, tHash;
-
-        f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
-        f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
-        f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
-        obj.resize(nRow, nCol);
-        matSize = nRow*nCol*sizeof(T);
-        tRead  = -usecond();
-        f.read(reinterpret_cast<char *>(obj.data()), matSize);
-        tRead += usecond();
-        tHash  = -usecond();
-#ifdef USE_IPP
-        check  = GridChecksum::crc32c(obj.data(), matSize);
-#else
-        check  = GridChecksum::crc32(obj.data(), matSize);
-#endif
-        tHash += usecond();
-        DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
-        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
-                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
-        if (crc != check)
+        GridBase     *loadGrid = (*this).getGrid();
+        Eigen::Index  nRow = 0, nCol = 0; // matrix shape, read from the file by the boss rank
+        if (!(loadGrid) || loadGrid->IsBoss())
         {
-            HADRONS_ERROR(Io, "checksum failed")
+            std::ifstream f(filename, std::ios::binary);
+            uint32_t      crc, check;
+            size_t        matSize;
+            double        tRead, tHash;
+            f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
+            f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+            f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
+            obj.resize(nRow, nCol);
+            matSize = nRow*nCol*sizeof(T);
+            tRead  = -usecond();
+            f.read(reinterpret_cast<char *>(obj.data()), matSize);
+            tRead += usecond();
+            tHash  = -usecond();
+    #ifdef USE_IPP
+            check  = GridChecksum::crc32c(obj.data(), matSize);
+    #else
+            check  = GridChecksum::crc32(obj.data(), matSize);
+    #endif
+            tHash += usecond();
+            DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
+            DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
+                        << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+            if (crc != check)
+            {
+                HADRONS_ERROR(Io, "checksum failed")
+            }
+        }
+        if (loadGrid)
+        {
+            // only the boss read the header: share the shape so every rank broadcasts into an equally sized buffer
+            loadGrid->Broadcast(loadGrid->BossRank(), &nRow, sizeof(nRow));
+            loadGrid->Broadcast(loadGrid->BossRank(), &nCol, sizeof(nCol));
+            obj.resize(nRow, nCol);
+            loadGrid->Broadcast(loadGrid->BossRank(), obj.data(), static_cast<int>(sizeof(T)*obj.size()));
+            loadGrid->Barrier();
        }
    }

    virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
    {
-        std::ofstream f(filename, std::ios::binary);
-        uint32_t      crc;
-        Eigen::Index  nRow, nCol;
-        size_t        matSize;
-        double        tWrite, tHash;
-        
-        nRow    = obj.rows();
-        nCol    = obj.cols();
-        matSize = nRow*nCol*sizeof(T);
-        tHash   = -usecond();
-#ifdef USE_IPP
-        crc     = GridChecksum::crc32c(obj.data(), matSize);
-#else
-        crc     = GridChecksum::crc32(obj.data(), matSize);
-#endif
-        tHash  += usecond();
-        f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
-        f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
-        f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
-        tWrite = -usecond();
-        f.write(reinterpret_cast<const char *>(obj.data()), matSize);
-        tWrite += usecond();
-        DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
-        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
-                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+        GridBase *saveGrid;
+        saveGrid = (*this).getGrid();
+        if (!(saveGrid) || saveGrid->IsBoss())
+        {
+            std::ofstream f(filename, std::ios::binary);
+            uint32_t      crc;
+            Eigen::Index  nRow, nCol;
+            size_t        matSize;
+            double        tWrite, tHash;
+            
+            nRow    = obj.rows();
+            nCol    = obj.cols();
+            matSize = nRow*nCol*sizeof(T);
+            tHash   = -usecond();
+    #ifdef USE_IPP
+            crc     = GridChecksum::crc32c(obj.data(), matSize);
+    #else
+            crc     = GridChecksum::crc32(obj.data(), matSize);
+    #endif
+            tHash  += usecond();
+            f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
+            f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+            f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
+            tWrite = -usecond();
+            f.write(reinterpret_cast<const char *>(obj.data()), matSize);
+            tWrite += usecond();
+            DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
+            DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
+                        << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+        }
+        if (saveGrid)   saveGrid->Barrier();
    }
 };

@@ -228,8 +262,9 @@ template <typename T>
 DiskVectorBase<T>::DiskVectorBase(const std::string dirname, 
                                  const unsigned int size,
                                  const unsigned int cacheSize,
-                                  const bool clean)
-: dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
+                                  const bool clean,
+                                  GridBase *grid)
+: dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean), grid_(grid)
 , cachePtr_(new std::vector<T>(size))
 , modifiedPtr_(new std::vector<bool>(size, false))
 , indexPtr_(new std::map<unsigned int, unsigned int>())
@@ -238,15 +273,21 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
 {
    struct stat s;

-    if(stat(dirname.c_str(), &s) == 0)
+    if (!(grid_) || grid_->IsBoss())
    {
-        HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
+        if(stat(dirname.c_str(), &s) == 0)
+        {
+            HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
+        }
+        mkdir(dirname);
    }
-    mkdir(dirname);
+    if (grid_)  grid_->Barrier();
    for (unsigned int i = 0; i < cacheSize_; ++i)
    {
        freePtr_->push(i);
    }
+    setSize(size_);
+    setGrid(grid_);
 }

 template <typename T>
@@ -258,6 +299,30 @@ DiskVectorBase<T>::~DiskVectorBase(void)
    }
 }

+template <typename T>
+void DiskVectorBase<T>::setSize(unsigned int size_)
+{
+    dvSize = size_;
+}
+
+template <typename T>
+unsigned int DiskVectorBase<T>::getSize() const
+{
+    return dvSize;
+}
+
+template <typename T>
+void DiskVectorBase<T>::setGrid(GridBase *grid)
+{
+    grid_ = dvGrid = grid; // also refresh the private grid_ that the barriers and boss checks use
+}
+
+template <typename T>
+GridBase *DiskVectorBase<T>::getGrid() const
+{
+    return dvGrid;
+}
+
 template <typename T>
 const T & DiskVectorBase<T>::operator[](const unsigned int i) const
 {
@@ -299,7 +364,7 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
    }
    DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif
-
+    if (grid_)  grid_->Barrier();
    return cache[index.at(i)];
 }

@@ -358,6 +423,7 @@ void DiskVectorBase<T>::evict(void) const
        index.erase(i);
        loads.pop_front();
    }
+    if (grid_)  grid_->Barrier();
 }

 template <typename T>
@@ -395,27 +461,28 @@ void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
     auto &freeInd  = *freePtr_;
     auto &loads    = *loadsPtr_;

     // cache miss, evict and store
     if (index.find(i) == index.end())
     {
         evict();
         index[i] = freeInd.top();
         freeInd.pop();
         cache[index.at(i)] = obj;
         loads.push_back(i);
         modified[index.at(i)] = false;
     }
     // cache hit, modify current value
     else
     {
         auto pos = std::find(loads.begin(), loads.end(), i);
         
         cache[index.at(i)]    = obj;
         modified[index.at(i)] = true;
         loads.erase(pos);
         loads.push_back(i);
     }

+    if (grid_)  grid_->Barrier();
 #ifdef DV_DEBUG
     std::string msg;

@@ -434,21 +487,23 @@ void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
 template <typename T>
 void DiskVectorBase<T>::clean(void)
 {
-    auto unlink = [](const char *fpath, const struct stat *sb, 
-                     int typeflag, struct FTW *ftwbuf)
+    if (!(grid_) || grid_->IsBoss())
    {
-        int rv = remove(fpath);
+        auto unlink = [](const char *fpath, const struct stat *sb,
+                         int typeflag, struct FTW *ftwbuf) {
+            int rv = remove(fpath);

-        if (rv)
-        {
-            HADRONS_ERROR(Io, "cannot remove '" + std::string(fpath) + "': "
-                          + std::string(std::strerror(errno)));
-        }
+            if (rv)
+            {
+                HADRONS_ERROR(Io, "cannot remove '" + std::string(fpath) + "': " + std::string(std::strerror(errno)));
+            }

-        return rv;
-    };
+            return rv;
+        };

-    nftw(dirname_.c_str(), unlink, 64, FTW_DEPTH | FTW_PHYS);
+        nftw(dirname_.c_str(), unlink, 64, FTW_DEPTH | FTW_PHYS);
+    }
+    if (grid_)  grid_->Barrier();
 }

 END_HADRONS_NAMESPACE
--- a/Hadrons/Environment.cc
+++ b/Hadrons/Environment.cc
@@ -84,6 +84,16 @@ GridParallelRNG * Environment::get4dRng(void)
    return rng4d_.get();
 }

+GridSerialRNG * Environment::getSerialRng(void)
+{
+    // the serial RNG is created on the first request and reused afterwards
+    if (!rngSerial_)
+    {
+        rngSerial_.reset(new GridSerialRNG());
+    }
+    return rngSerial_.get();
+}
+
 // general memory management ///////////////////////////////////////////////////
 void Environment::addObject(const std::string name, const int moduleAddress)
 {
--- a/Hadrons/Environment.hpp
+++ b/Hadrons/Environment.hpp
@@ -74,6 +74,7 @@ public:
    typedef std::unique_ptr<GridCartesian>         GridPt;
    typedef std::unique_ptr<GridRedBlackCartesian> GridRbPt;
    typedef std::unique_ptr<GridParallelRNG>       RngPt;
+    typedef std::unique_ptr<GridSerialRNG>         SerialRngPt;
    enum class Storage {object, cache, temporary};
 private:
    struct ObjInfo
@@ -114,6 +115,7 @@ public:
    double                  getVolume(void) const;
    // random number generator
    GridParallelRNG *       get4dRng(void);
+    GridSerialRNG *         getSerialRng(void);
    // general memory management
    void                    addObject(const std::string name,
                                      const int moduleAddress = -1);
@@ -183,6 +185,7 @@ private:
    unsigned int                        nd_;
    // random number generator
    RngPt                               rng4d_{nullptr};
+    SerialRngPt                         rngSerial_{nullptr};
    // object store
    std::vector<ObjInfo>                object_;
    std::map<std::string, unsigned int> objectAddress_;
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -272,7 +272,7 @@ struct Correlator: Serializable
 {
    GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
                                    Metadata,             info,
-                                    std::vector<Complex>, corr);
+                                    std::vector<Scalar>, corr);
 };

 END_HADRONS_NAMESPACE
--- a/Hadrons/Module.cc
+++ b/Hadrons/Module.cc
@@ -93,3 +93,18 @@ GridParallelRNG & ModuleBase::rng4d(void)

    return r;
 }
+
+GridSerialRNG & ModuleBase::rngSerial(void)
+{
+    auto &r = *env().getSerialRng();
+    // reseed only when the module seed string differs from the cached seed_
+    if (makeSeedString() != seed_) // NOTE(review): is seed_ also rng4d()'s cache? if so the RNG requested second is never seeded — confirm
+    {
+        seed_ = makeSeedString();
+        LOG(Message) << "Seeding Serial RNG " << &r << " with string '" 
+                     << seed_ << "'" << std::endl;
+        r.SeedUniqueString(seed_);
+    }
+
+    return r;
+}
--- a/Hadrons/Module.hpp
+++ b/Hadrons/Module.hpp
@@ -1,7 +1,6 @@
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid 
-
 Source file: Hadrons/Module.hpp

 Copyright (C) 2015-2019
@@ -196,6 +195,7 @@ protected:
    DEFINE_VM_ALIAS;
    // RNG seeded from module string
    GridParallelRNG &rng4d(void);
+    GridSerialRNG &rngSerial(void);
 private:
    std::string makeSeedString(void);
 private:
--- a/Hadrons/Modules.hpp
+++ b/Hadrons/Modules.hpp
@@ -1,71 +1,87 @@
-#include <Hadrons/Modules/MSource/Gauss.hpp>
-#include <Hadrons/Modules/MSource/Momentum.hpp>
-#include <Hadrons/Modules/MSource/SeqAslash.hpp>
-#include <Hadrons/Modules/MSource/Z2.hpp>
-#include <Hadrons/Modules/MSource/Point.hpp>
-#include <Hadrons/Modules/MSource/SeqGamma.hpp>
-#include <Hadrons/Modules/MSource/Convolution.hpp>
-#include <Hadrons/Modules/MSource/Wall.hpp>
-#include <Hadrons/Modules/MSource/SeqConserved.hpp>
-#include <Hadrons/Modules/MScalarSUN/Div.hpp>
-#include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
-#include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
-#include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
-#include <Hadrons/Modules/MScalarSUN/Grad.hpp>
-#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
-#include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
-#include <Hadrons/Modules/MScalarSUN/EMT.hpp>
-#include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
-#include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
-#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
-#include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
-#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
-#include <Hadrons/Modules/MScalar/FreeProp.hpp>
-#include <Hadrons/Modules/MScalar/Scalar.hpp>
-#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
-#include <Hadrons/Modules/MAction/Wilson.hpp>
-#include <Hadrons/Modules/MAction/ScaledDWF.hpp>
-#include <Hadrons/Modules/MAction/MobiusDWF.hpp>
-#include <Hadrons/Modules/MAction/WilsonClover.hpp>
-#include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
 #include <Hadrons/Modules/MAction/DWF.hpp>
-#include <Hadrons/Modules/MGauge/UnitEm.hpp>
-#include <Hadrons/Modules/MGauge/Electrify.hpp>
-#include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
-#include <Hadrons/Modules/MGauge/Random.hpp>
-#include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
-#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
-#include <Hadrons/Modules/MGauge/Unit.hpp>
-#include <Hadrons/Modules/MGauge/StochEm.hpp>
-#include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
-#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
-#include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
-#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
-#include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
-#include <Hadrons/Modules/MIO/LoadNersc.hpp>
-#include <Hadrons/Modules/MIO/LoadBinary.hpp>
-#include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
+#include <Hadrons/Modules/MAction/MobiusDWF.hpp>
+#include <Hadrons/Modules/MAction/ScaledDWF.hpp>
+#include <Hadrons/Modules/MAction/WilsonClover.hpp>
+#include <Hadrons/Modules/MAction/Wilson.hpp>
+#include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
+#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
+#include <Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp>
+#include <Hadrons/Modules/MContraction/A2ALoop.hpp>
+#include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
+#include <Hadrons/Modules/MContraction/Baryon.hpp>
+#include <Hadrons/Modules/MContraction/DiscLoop.hpp>
+#include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
+#include <Hadrons/Modules/MContraction/Meson.hpp>
+#include <Hadrons/Modules/MContraction/SigmaToNucleonEye.hpp>
+#include <Hadrons/Modules/MContraction/SigmaToNucleonNonEye.hpp>
 #include <Hadrons/Modules/MContraction/WeakEye3pt.hpp>
 #include <Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp>
-#include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
-#include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
-#include <Hadrons/Modules/MContraction/A2ALoop.hpp>
 #include <Hadrons/Modules/MContraction/WeakNonEye3pt.hpp>
-#include <Hadrons/Modules/MContraction/DiscLoop.hpp>
-#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
-#include <Hadrons/Modules/MContraction/Baryon.hpp>
-#include <Hadrons/Modules/MContraction/Meson.hpp>
-#include <Hadrons/Modules/MNPR/FourQuark.hpp>
-#include <Hadrons/Modules/MNPR/Bilinear.hpp>
-#include <Hadrons/Modules/MNPR/Amputate.hpp>
-#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
-#include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
-#include <Hadrons/Modules/MSolver/Guesser.hpp>
-#include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
-#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
-#include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
+#include <Hadrons/Modules/MDistil/Distil.hpp>
+#include <Hadrons/Modules/MDistil/DistilPar.hpp>
+#include <Hadrons/Modules/MDistil/DistilVectors.hpp>
+#include <Hadrons/Modules/MDistil/LapEvec.hpp>
+#include <Hadrons/Modules/MDistil/Noises.hpp>
+#include <Hadrons/Modules/MDistil/PerambFromSolve.hpp>
+#include <Hadrons/Modules/MDistil/Perambulator.hpp>
+#include <Hadrons/Modules/MFermion/EMLepton.hpp>
 #include <Hadrons/Modules/MFermion/FreeProp.hpp>
 #include <Hadrons/Modules/MFermion/GaugeProp.hpp>
-#include <Hadrons/Modules/MFermion/EMLepton.hpp>
-#include <Hadrons/Modules/MSink/Smear.hpp>
+#include <Hadrons/Modules/MGauge/Electrify.hpp>
+#include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
+#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
+#include <Hadrons/Modules/MGauge/Random.hpp>
+#include <Hadrons/Modules/MGauge/StochEm.hpp>
+#include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
+#include <Hadrons/Modules/MGauge/UnitEm.hpp>
+#include <Hadrons/Modules/MGauge/Unit.hpp>
+#include <Hadrons/Modules/MIO/LoadA2AMatrixDiskVector.hpp>
+#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
+#include <Hadrons/Modules/MIO/LoadBinary.hpp>
+#include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
+#include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
+#include <Hadrons/Modules/MIO/LoadDistilNoise.hpp>
+#include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
+#include <Hadrons/Modules/MIO/LoadNersc.hpp>
+#include <Hadrons/Modules/MIO/LoadPerambulator.hpp>
+#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MNoise/SparseSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MNPR/Amputate.hpp>
+#include <Hadrons/Modules/MNPR/Bilinear.hpp>
+#include <Hadrons/Modules/MNPR/FourQuark.hpp>
+#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Hadrons/Modules/MScalar/FreeProp.hpp>
+#include <Hadrons/Modules/MScalar/Scalar.hpp>
+#include <Hadrons/Modules/MScalarSUN/Div.hpp>
+#include <Hadrons/Modules/MScalarSUN/EMT.hpp>
+#include <Hadrons/Modules/MScalarSUN/Grad.hpp>
+#include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
+#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
+#include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
+#include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
 #include <Hadrons/Modules/MSink/Point.hpp>
+#include <Hadrons/Modules/MSink/Smear.hpp>
+#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
+#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
+#include <Hadrons/Modules/MSolver/Guesser.hpp>
+#include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
+#include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
+#include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
+#include <Hadrons/Modules/MSource/Convolution.hpp>
+#include <Hadrons/Modules/MSource/Gauss.hpp>
+#include <Hadrons/Modules/MSource/JacobiSmear.hpp>
+#include <Hadrons/Modules/MSource/Momentum.hpp>
+#include <Hadrons/Modules/MSource/MomentumPhase.hpp>
+#include <Hadrons/Modules/MSource/Point.hpp>
+#include <Hadrons/Modules/MSource/SeqAslash.hpp>
+#include <Hadrons/Modules/MSource/SeqConserved.hpp>
+#include <Hadrons/Modules/MSource/SeqGamma.hpp>
+#include <Hadrons/Modules/MSource/Wall.hpp>
+#include <Hadrons/Modules/MSource/Z2.hpp>
+#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
+#include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
--- a/Hadrons/Modules/MContraction/A2AAslashField.hpp
+++ b/Hadrons/Modules/MContraction/A2AAslashField.hpp
@@ -174,6 +174,7 @@ void TA2AAslashField<FImpl, PhotonImpl>::setup(void)
 template <typename FImpl, typename PhotonImpl>
 void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
 {
+#ifndef GRID_NVCC
    auto &left  = envGet(std::vector<FermionField>, par().left);
    auto &right = envGet(std::vector<FermionField>, par().right);

@@ -237,6 +238,7 @@ void TA2AAslashField<FImpl, PhotonImpl>::execute(void)

    envGetTmp(Computation, computation);
    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
+#endif
 }

 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MContraction/A2AFourQuarkContraction.cc
+++ b/Hadrons/Modules/MContraction/A2AFourQuarkContraction.cc
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TA2AFourQuarkContraction<FIMPL>;
--- a/Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp
+++ b/Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp
@@ -0,0 +1,138 @@
+#ifndef Hadrons_MContraction_A2AFourQuarkContraction_hpp_
+#define Hadrons_MContraction_A2AFourQuarkContraction_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/DiskVector.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         A2AFourQuarkContraction                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MContraction)
+
+class A2AFourQuarkContractionPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AFourQuarkContractionPar,
+                                    std::string,  v1,        // name of a std::vector<FermionField> object (first vector set)
+                                    std::string,  v2,        // name of a std::vector<FermionField> object (second vector set)
+                                    std::string,  mf12,      // name of an EigenDiskVector<Complex> meson field
+                                    bool,         allContr,  // true: vector output, one entry per time slice; false: single output
+                                    unsigned int, dt);       // time shift (allContr) or index into mf12 (otherwise)
+};
+
+template <typename FImpl>
+class TA2AFourQuarkContraction: public Module<A2AFourQuarkContractionPar>
+{
+  public:
+    FERM_TYPE_ALIASES(FImpl, );
+    // constructor: takes the module instance name
+    TA2AFourQuarkContraction(const std::string name);
+    // destructor
+    virtual ~TA2AFourQuarkContraction(void) {};
+    // dependency relation: names of the objects this module reads / produces
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: registers the temporary "tmpWWVV" and the output object
+    virtual void setup(void);
+    // execution: runs A2Autils<FImpl>::ContractWWVV and stores the result
+    virtual void execute(void);
+  private:
+    unsigned int nt_; // temporal extent env().getDim(Tp), assigned in setup()
+};
+
+MODULE_REGISTER_TMP(A2AFourQuarkContraction, TA2AFourQuarkContraction<FIMPL>, MContraction);
+
+/******************************************************************************
+ *                 TA2AFourQuarkContraction implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TA2AFourQuarkContraction<FImpl>::TA2AFourQuarkContraction(const std::string name)
+: Module<A2AFourQuarkContractionPar>(name)
+{} // only forwards the module name to the Module base class
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TA2AFourQuarkContraction<FImpl>::getInput(void)
+{
+    // Dependencies, in this order: the two vector sets (v1, v2) and the
+    // meson field (mf12) named in the module parameters.
+    return {par().v1, par().v2, par().mf12};
+}
+
+template <typename FImpl>
+std::vector<std::string> TA2AFourQuarkContraction<FImpl>::getOutput(void)
+{
+    // The module produces exactly one object, registered under the
+    // module's own name.
+    return {getName()};
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AFourQuarkContraction<FImpl>::setup(void)
+{
+    nt_ = env().getDim(Tp); // assigned in both modes so the member is never left uninitialised
+    if (par().allContr) // all time translations: scratch and output are vectors built with nt_
+    {
+        envTmp(std::vector<PropagatorField>, "tmpWWVV", 1, nt_, envGetGrid(PropagatorField));
+        envCreate(std::vector<PropagatorField>, getName(), 1, nt_, envGetGrid(PropagatorField));
+    }
+    else // single dt: scratch vector built with 1, output is a plain PropagatorField
+    {
+        envTmp(std::vector<PropagatorField>, "tmpWWVV", 1, 1, envGetGrid(PropagatorField));
+        envCreate(PropagatorField, getName(), 1, envGetGrid(PropagatorField));
+    }
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AFourQuarkContraction<FImpl>::execute(void)
+{
+    // Inputs: the two fermion vector sets and the meson field named in the parameters.
+    auto &v1   = envGet(std::vector<FermionField>, par().v1);
+    auto &v2   = envGet(std::vector<FermionField>, par().v2);
+    auto &mf12 = envGet(EigenDiskVector<Complex>, par().mf12);
+    // Scratch propagators registered in setup() under the name "tmpWWVV".
+    envGetTmp(std::vector<PropagatorField>, tmpWWVV);
+
+    const unsigned int dt = par().dt;
+
+    if (par().allContr)
+    {
+        LOG(Message) << "Computing 4 quark contraction for " << getName()
+                     << " for all t0 time translations "
+                     << "with nt = " << nt_ << " and dt = " << dt << std::endl;
+        // ContractWWVV fills tmpWWVV; output slot t then receives entry (t + dt) % nt_.
+        auto &WWVV = envGet(std::vector<PropagatorField>, getName());
+        A2Autils<FImpl>::ContractWWVV(tmpWWVV, mf12, &v1[0], &v2[0]);
+        for(unsigned int t = 0; t < nt_; t++){
+            unsigned int t0 = (t + dt) % nt_;
+            WWVV[t] = tmpWWVV[t0];
+        }
+    }
+    else
+    {
+        LOG(Message) << "Computing 4 quark contraction for: " << getName()
+                     << " for time dt = " << dt << std::endl;
+        // Copy entry dt of the meson field and view it as a 1 x ni x nj tensor for one contraction.
+        auto &WWVV = envGet(PropagatorField, getName());
+        int ni = v1.size();
+        int nj = v2.size();
+        Eigen::Matrix<Complex, -1, -1, Eigen::RowMajor> mf;
+        mf = mf12[dt];
+        Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>> mfT(mf.data(), 1, ni, nj);
+        A2Autils<FImpl>::ContractWWVV(tmpWWVV, mfT, &v1[0], &v2[0]);
+        WWVV = tmpWWVV[0];
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MContraction_A2AFourQuarkContraction_hpp_
--- a/Hadrons/Modules/MContraction/Baryon.hpp
+++ b/Hadrons/Modules/MContraction/Baryon.hpp
@@ -7,7 +7,7 @@ Source file: Hadrons/Modules/MContraction/Baryon.hpp
 Copyright (C) 2015-2019

 Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Lanny91 <andrew.lawson@gmail.com>
+Author: Felix Erben <felix.erben@ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -33,6 +33,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
+#include <Grid/qcd/utils/BaryonUtils.h>

 BEGIN_HADRONS_NAMESPACE

@@ -41,6 +42,9 @@ BEGIN_HADRONS_NAMESPACE
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)

+using GammaAB = std::pair<Gamma::Algebra, Gamma::Algebra>; // the two gamma structures of one operator
+using GammaABPair = std::pair<GammaAB, GammaAB>;           // two such operators: first and second
+
 class BaryonPar: Serializable
 {
 public:
@@ -48,6 +52,11 @@ public:
                                    std::string, q1,
                                    std::string, q2,
                                    std::string, q3,
+                                    std::string, gammas,
+                                    std::string, quarks,
+                                    std::string, prefactors,
+                                    std::string, parity,
+                                    std::string, sink,
                                    std::string, output);
 };

@@ -58,12 +67,21 @@ public:
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    FERM_TYPE_ALIASES(FImpl3, 3);
-    class Result: Serializable
+    BASIC_TYPE_ALIASES(ScalarImplCR, Scalar);
+    SINK_TYPE_ALIASES(Scalar);
+    class Metadata: Serializable
    {
    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        std::vector<std::vector<std::vector<Complex>>>, corr);
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Metadata,
+                                        Gamma::Algebra, gammaA_left,
+                                        Gamma::Algebra, gammaB_left,
+                                        Gamma::Algebra, gammaA_right,
+                                        Gamma::Algebra, gammaB_right,
+                                        std::string, quarks,
+                                        std::string, prefactors,
+                                        int, parity);
    };
+    typedef Correlator<Metadata> Result;
 public:
    // constructor
    TBaryon(const std::string name);
@@ -72,11 +90,14 @@ public:
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
+    virtual void parseGammaString(std::vector<GammaABPair> &gammaList);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
+    // Which gamma algebra was specified
+    Gamma::Algebra  al;
 };

 MODULE_REGISTER_TMP(Baryon, ARG(TBaryon<FIMPL, FIMPL, FIMPL>), MContraction);
@@ -94,7 +115,7 @@ TBaryon<FImpl1, FImpl2, FImpl3>::TBaryon(const std::string name)
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 std::vector<std::string> TBaryon<FImpl1, FImpl2, FImpl3>::getInput(void)
 {
-    std::vector<std::string> input = {par().q1, par().q2, par().q3};
+    std::vector<std::string> input = {par().q1, par().q2, par().q3, par().sink}; // sink is a dependency too: execute() looks up the module owning it
    
    return input;
 }
@@ -107,30 +128,199 @@ std::vector<std::string> TBaryon<FImpl1, FImpl2, FImpl3>::getOutput(void)
    return out;
 }

+// Parse par().gammas into a list of ((GammaA,GammaB),(GammaA,GammaB)) operator pairs.
+// Accepted form: one or more groups "((A B)(C D))"; each shorthand below expands to one "(A B)".
+template <typename FImpl1, typename FImpl2, typename FImpl3>
+void TBaryon<FImpl1, FImpl2,FImpl3>::parseGammaString(std::vector<GammaABPair> &gammaList)
+{
+    gammaList.clear();
+    
+    std::string gammaString = par().gammas;
+    //Shorthands for less common baryon operators. These are expanded first:
+    //"j12" is a prefix of both names, so replacing it earlier would turn e.g.
+    //"j12_alt1" into "(Identity SigmaXZ)_alt1" and the shorthand could never match.
+    gammaString = regex_replace(gammaString, std::regex("j12_alt1"),"(Gamma5 MinusSigmaYT)");
+    gammaString = regex_replace(gammaString, std::regex("j12_alt2"),"(Identity GammaYGamma5)");
+
+    //Shorthands for standard baryon operators
+    gammaString = regex_replace(gammaString, std::regex("j12"),"(Identity SigmaXZ)");
+    gammaString = regex_replace(gammaString, std::regex("j32X"),"(Identity MinusGammaZGamma5)");
+    gammaString = regex_replace(gammaString, std::regex("j32Y"),"(Identity GammaT)");
+    gammaString = regex_replace(gammaString, std::regex("j32Z"),"(Identity GammaXGamma5)");
+    
+    //A single gamma matrix 
+    std::regex rex_g("([0-9a-zA-Z]+)");
+    //The full string we expect
+    std::regex rex("( *\\(( *\\(([0-9a-zA-Z]+) +([0-9a-zA-Z]+) *\\)){2} *\\) *)+");
+    std::smatch sm;
+    //regex_match only succeeds when the pattern covers the whole string.
+    //NOTE(review): with NDEBUG the assert disappears and malformed input is not rejected.
+    std::regex_match(gammaString, sm, rex);
+    assert(sm[0].matched && "invalid gamma structure.");
+
+    //Collect every gamma name in order of appearance.
+    std::vector<std::string> gS;
+    auto gamma_begin = std::sregex_iterator(gammaString.begin(), gammaString.end(), rex_g);
+    auto gamma_end = std::sregex_iterator();
+    for (std::sregex_iterator i = gamma_begin; i != gamma_end; ++i) {
+        gS.push_back(i->str());
+    }
+
+    //Four consecutive names make one list entry: two for .first, two for .second.
+    //Integer division drops any incomplete trailing group.
+    gammaList.resize(gS.size()/4);
+    for (unsigned int i = 0; i < gammaList.size(); i++){
+        //strToVec decodes each name; only the first decoded element is used.
+        std::vector<Gamma::Algebra> gS1 = strToVec<Gamma::Algebra>(gS[4*i]);
+        std::vector<Gamma::Algebra> gS2 = strToVec<Gamma::Algebra>(gS[4*i+1]);
+        std::vector<Gamma::Algebra> gS3 = strToVec<Gamma::Algebra>(gS[4*i+2]);
+        std::vector<Gamma::Algebra> gS4 = strToVec<Gamma::Algebra>(gS[4*i+3]);
+
+        gammaList[i].first.first=gS1[0];
+        gammaList[i].first.second=gS2[0];
+        gammaList[i].second.first=gS3[0];
+        gammaList[i].second.second=gS4[0];
+    }
+}
+
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 void TBaryon<FImpl1, FImpl2, FImpl3>::setup(void)
 {
    envTmpLat(LatticeComplex, "c");
+    envTmpLat(LatticeComplex, "c2"); // scratch field: receives each single ContractBaryons result before it is added into c
 }

 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
 {
-    LOG(Message) << "Computing baryon contractions '" << getName() << "' using"
-                 << " quarks '" << par().q1 << "', '" << par().q2 << "', and '"
-                 << par().q3 << "'" << std::endl;
-    
-    auto       &q1 = envGet(PropagatorField1, par().q1);
-    auto       &q2 = envGet(PropagatorField2, par().q2);
-    auto       &q3 = envGet(PropagatorField3, par().q2);
+    // Decode the module parameters: quark-structures and matching prefactors.
+    std::vector<std::string> quarks = strToVec<std::string>(par().quarks);
+    std::vector<double> prefactors = strToVec<double>(par().prefactors);
+    int nQ=quarks.size();
+    const int  parity {par().parity.size()>0 ? std::stoi(par().parity) : 1}; // 1 when the string is empty
+    // Gamma structures come from par().gammas (see parseGammaString).
+    std::vector<GammaABPair> gammaList;
+    parseGammaString(gammaList);
+    // Sanity checks on the decoded parameters.
+    assert(prefactors.size()==nQ && "number of prefactors needs to match number of quark-structures.");
+    for (int iQ = 0; iQ < nQ; iQ++)
+        assert(quarks[iQ].size()==3 && "quark-structures must consist of 3 quarks each.");
+
+    LOG(Message) << "Computing baryon contractions '" << getName() << "'" << std::endl;
+    for (int iQ1 = 0; iQ1 < nQ; iQ1++)
+        for (int iQ2 = 0; iQ2 < nQ; iQ2++)
+            LOG(Message) << prefactors[iQ1]*prefactors[iQ2] << "*<" << quarks[iQ1] << "|" << quarks[iQ2] << ">" << std::endl;
+    LOG(Message) << " using quarks '" << par().q1 << "', '" << par().q2 << "', and '" << par().q3 << "'" << std::endl;
+    for (unsigned int iG = 0; iG < gammaList.size(); iG++)
+        LOG(Message) << " with (Gamma^A,Gamma^B)_left = ( " << gammaList[iG].first.first << " , " << gammaList[iG].first.second << " ) and (Gamma^A,Gamma^B)_right = ( " << gammaList[iG].second.first << " , " << gammaList[iG].second.second << " )" << std::endl;
+    LOG(Message) << "and parity " << parity << " using sink " << par().sink << "." << std::endl;
+
    envGetTmp(LatticeComplex, c);
-    Result     result;
-    
-    // FIXME: do contractions
-    
-    // saveResult(par().output, "meson", result);
+    envGetTmp(LatticeComplex, c2);
+    int nt = env().getDim(Tp);
+    std::vector<TComplex> buf;
+    TComplex cs;
+    TComplex ch;
+
+    std::vector<Result> result;
+    Result              r;
+    r.info.parity = parity;
+    r.info.quarks = par().quarks;
+    r.info.prefactors = par().prefactors;
+
+    if (envHasType(SlicedPropagator1, par().q1) and
+        envHasType(SlicedPropagator2, par().q2) and
+        envHasType(SlicedPropagator3, par().q3))
+    {
+        // Sliced propagators: contract slice by slice, one correlator entry per t.
+        auto &q1 = envGet(SlicedPropagator1, par().q1);
+        auto &q2 = envGet(SlicedPropagator2, par().q2);
+        auto &q3 = envGet(SlicedPropagator3, par().q3);
+        for (unsigned int i = 0; i < gammaList.size(); ++i)
+        {
+            r.info.gammaA_left = gammaList[i].first.first;
+            r.info.gammaB_left = gammaList[i].first.second;
+            r.info.gammaA_right = gammaList[i].second.first;
+            r.info.gammaB_right = gammaList[i].second.second;
+
+            Gamma gAl(gammaList[i].first.first);
+            Gamma gBl(gammaList[i].first.second);
+            Gamma gAr(gammaList[i].second.first);
+            Gamma gBr(gammaList[i].second.second);
+
+            LOG(Message) << "(propagator already sinked)" << std::endl;
+            r.corr.clear();
+            // buf is never filled in this branch, so the time loop must run over
+            // nt; bounding it by buf.size() (0 here) produced an empty correlator.
+            for (int t = 0; t < nt; ++t)
+            {
+                cs = Zero();
+                for (int iQ1 = 0; iQ1 < nQ; iQ1++){
+                    for (int iQ2 = 0; iQ2 < nQ; iQ2++){
+                        BaryonUtils<FIMPL>::ContractBaryons_Sliced(q1[t],q2[t],q3[t],gAl,gBl,gAr,gBr,quarks[iQ1].c_str(),quarks[iQ2].c_str(),parity,ch);
+                        cs += prefactors[iQ1]*prefactors[iQ2]*ch;
+                    }
+                }
+                r.corr.push_back(TensorRemove(cs));
+            }
+            result.push_back(r);
+        }
+    }
+    else
+    {
+        // Lattice propagators: contract on the full field, then apply the sink.
+        auto       &q1 = envGet(PropagatorField1, par().q1);
+        auto       &q2 = envGet(PropagatorField2, par().q2);
+        auto       &q3 = envGet(PropagatorField3, par().q3);
+        for (unsigned int i = 0; i < gammaList.size(); ++i)
+        {
+            r.info.gammaA_left = gammaList[i].first.first;
+            r.info.gammaB_left = gammaList[i].first.second;
+            r.info.gammaA_right = gammaList[i].second.first;
+            r.info.gammaB_right = gammaList[i].second.second;
+
+            Gamma gAl(gammaList[i].first.first);
+            Gamma gBl(gammaList[i].first.second);
+            Gamma gAr(gammaList[i].second.first);
+            Gamma gBr(gammaList[i].second.second);
+
+            std::string ns;
+
+            ns = vm().getModuleNamespace(env().getObjectModule(par().sink));
+            // Prefactor-weighted sum over all pairs of quark-structures. It does
+            // not depend on the sink type, so it is computed once for both cases.
+            c=Zero();
+            for (int iQ1 = 0; iQ1 < nQ; iQ1++){
+                for (int iQ2 = 0; iQ2 < nQ; iQ2++){
+                    BaryonUtils<FIMPL>::ContractBaryons(q1,q2,q3,gAl,gBl,gAr,gBr,quarks[iQ1].c_str(),quarks[iQ2].c_str(),parity,c2);
+                    c+=prefactors[iQ1]*prefactors[iQ2]*c2;
+                }
+            }
+            if (ns == "MSource")
+            {
+                PropagatorField1 &sink = envGet(PropagatorField1, par().sink);
+                auto test = closure(trace(sink*c));
+                sliceSum(test, buf, Tp);
+            }
+            else if (ns == "MSink")
+            {
+                SinkFnScalar &sink = envGet(SinkFnScalar, par().sink);
+                buf = sink(c);
+            }
+            // NOTE(review): for any other namespace buf keeps its previous content - confirm this cannot happen
+            r.corr.clear();
+            for (unsigned int t = 0; t < buf.size(); ++t)
+            {
+                r.corr.push_back(TensorRemove(buf[t]));
+            }
+            result.push_back(r);
+        }
+    }
+
+    saveResult(par().output, "baryon", result);
+
 }

 END_MODULE_NAMESPACE
--- a/Hadrons/Modules/MContraction/Gamma3pt.hpp
+++ b/Hadrons/Modules/MContraction/Gamma3pt.hpp
@@ -57,7 +57,8 @@ BEGIN_HADRONS_NAMESPACE
 *   - q1: sink smeared propagator, source at i
 *   - q2: propagator, source at i
 *   - q3: propagator, source at f
- *   - gamma: gamma matrix to insert
+ *   - gamma: gamma matrices to insert, or "all"
+ *            (space-separated strings e.g. "GammaT GammaX GammaY") 
 *   - tSnk: sink position for propagator q1.
 *
 */
@@ -71,12 +72,12 @@ class Gamma3ptPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(Gamma3ptPar,
-                                    std::string,    q1,
-                                    std::string,    q2,
-                                    std::string,    q3,
-                                    Gamma::Algebra, gamma,
-                                    unsigned int,   tSnk,
-                                    std::string,    output);
+                                    std::string,  q1,
+                                    std::string,  q2,
+                                    std::string,  q3,
+                                    std::string,  gamma,
+                                    unsigned int, tSnk,
+                                    std::string,  output);
 };

 template <typename FImpl1, typename FImpl2, typename FImpl3>
@@ -100,6 +101,7 @@ public:
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
+    virtual void parseGammaString(std::vector<Gamma::Algebra> &gammaList);
 protected:
    // setup
    virtual void setup(void);
@@ -142,37 +144,67 @@ void TGamma3pt<FImpl1, FImpl2, FImpl3>::setup(void)
    envTmpLat(LatticeComplex, "c");
 }

+// Fill gammaList with the insertions requested by par().gamma: either the
+// keyword "all" or a list of names decoded by strToVec.
+template <typename FImpl1, typename FImpl2, typename FImpl3>
+void TGamma3pt<FImpl1, FImpl2, FImpl3>::parseGammaString(std::vector<Gamma::Algebra> &gammaList)
+{
+    gammaList.clear();
+    if (par().gamma != "all")
+    {
+        // Explicit list: decode the individual names from the input string.
+        gammaList = strToVec<Gamma::Algebra>(par().gamma);
+        return;
+    }
+    // "all": take every odd algebra index, starting at 1 and staying
+    // strictly below Gamma::nGamma.
+    for (unsigned int idx = 1; idx < Gamma::nGamma; idx += 2)
+    {
+        gammaList.push_back((Gamma::Algebra)idx);
+    }
+}
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
 {
    LOG(Message) << "Computing 3pt contractions '" << getName() << "' using"
                 << " quarks '" << par().q1 << "', '" << par().q2 << "' and '"
-                 << par().q3 << "', with " << par().gamma << " insertion." 
+                 << par().q3 << "', with " << par().gamma << " insertions." 
                 << std::endl;

    // Initialise variables. q2 and q3 are normal propagators, q1 may be 
    // sink smeared.
-    auto                  &q1 = envGet(SlicedPropagator1, par().q1);
-    auto                  &q2 = envGet(PropagatorField2, par().q2);
-    auto                  &q3 = envGet(PropagatorField2, par().q3);
-    Gamma                 g5(Gamma::Algebra::Gamma5);
-    Gamma                 gamma(par().gamma);
-    std::vector<TComplex> buf;
-    Result                result;
+    auto                        &q1 = envGet(SlicedPropagator1, par().q1);
+    auto                        &q2 = envGet(PropagatorField2, par().q2);
+    auto                        &q3 = envGet(PropagatorField2, par().q3);
+    Gamma                       g5(Gamma::Algebra::Gamma5);
+    std::vector<Gamma::Algebra> gammaList;
+    std::vector<TComplex>       buf;
+    std::vector<Result>         result;
+    int                         nt = env().getDim(Tp);
+    // NOTE(review): q3 is fetched as PropagatorField2 although the module has a third parameter FImpl3 - confirm intended
+    // One Result per requested gamma insertion, each with its correlator resized to nt entries.
+    parseGammaString(gammaList);
+    result.resize(gammaList.size());
+    for (unsigned int i = 0; i < result.size(); ++i)
+    {
+        result[i].gamma = gammaList[i];
+        result[i].corr.resize(nt);
+    }
    
    // Extract relevant timeslice of sinked propagator q1, then contract &
    // sum over all spacial positions of gamma insertion.
    SitePropagator1 q1Snk = q1[par().tSnk];
    envGetTmp(LatticeComplex, c);
-    c = trace(g5*q1Snk*adj(q2)*(g5*gamma)*q3);
-    sliceSum(c, buf, Tp);
-
-    result.gamma = par().gamma;
-    result.corr.resize(buf.size());
-    for (unsigned int t = 0; t < buf.size(); ++t)
+    for (unsigned int i = 0; i < result.size(); ++i)
    {
-        result.corr[t] = TensorRemove(buf[t]);
+        Gamma gamma(gammaList[i]);
+        c = trace(g5*q1Snk*adj(q2)*(g5*gamma)*q3);
+        sliceSum(c, buf, Tp);
+        for (unsigned int t = 0; t < buf.size(); ++t)
+        {
+            result[i].corr[t] = TensorRemove(buf[t]);
+        }
    }
    saveResult(par().output, "gamma3pt", result);
 }
--- a/Hadrons/Modules/MContraction/Meson.hpp
+++ b/Hadrons/Modules/MContraction/Meson.hpp
@@ -199,7 +199,7 @@ void TMeson<FImpl1, FImpl2>::execute(void)
            Gamma gSnk(gammaList[i].first);
            Gamma gSrc(gammaList[i].second);
            
-            for (unsigned int t = 0; t < buf.size(); ++t)
+            for (unsigned int t = 0; t < nt; ++t)
            {
                result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
            }
--- a/Hadrons/Modules/MContraction/SigmaToNucleonEye.cc
+++ b/Hadrons/Modules/MContraction/SigmaToNucleonEye.cc
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/MContraction/SigmaToNucleonEye.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TSigmaToNucleonEye<FIMPL>; // explicit instantiation of the module template for FIMPL
--- a/Hadrons/Modules/MContraction/SigmaToNucleonEye.hpp
+++ b/Hadrons/Modules/MContraction/SigmaToNucleonEye.hpp
@@ -0,0 +1,218 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/SigmaToNucleonEye.hpp
+
+Copyright (C) 2015-2019
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Felix Erben <felix.erben@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MContraction_SigmaToNucleonEye_hpp_
+#define Hadrons_MContraction_SigmaToNucleonEye_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Grid/qcd/utils/BaryonUtils.h>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                               SigmaToNucleonEye                                       *
+ ******************************************************************************/
+/*
+ * Sigma-to-nucleon 3-pt diagrams, eye topologies.
+ * 
+ * Schematics:      qqLoop                |                  
+ *                  /->-¬                 |                             
+ *                 /     \                |          qsTi      G     qdTf
+ *                 \     /                |        /---->------*------>----¬         
+ *          qsTi   \   /    qdTf          |       /          /-*-¬          \
+ *       /----->-----* *----->----¬       |      /          /  G  \          \
+ *      *            G G           *      |     *           \     /  qqLoop  * 
+ *      |\                        /|      |     |\           \-<-/          /|   
+ *      | \                      / |      |     | \                        / |      
+ *      |  \---------->---------/  |      |     |  \----------->----------/  |      
+ *       \          quSpec        /       |      \          quSpec          /        
+ *        \                      /        |       \                        /
+ *         \---------->---------/         |        \----------->----------/
+ *                  quSpec                |                 quSpec
+ * 
+ * analogously to the rare-kaon naming, the left diagram is named 'one-trace' and
+ * the diagram on the right 'two-trace'
+ *
+ * Propagators:
+ *  * qqLoop
+ *  * quSpec, source at ti
+ *  * qdTf,   source at tf 
+ *  * qsTi,   source at ti
+ */
+BEGIN_MODULE_NAMESPACE(MContraction)
+
+class SigmaToNucleonEyePar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(SigmaToNucleonEyePar,
+                                    std::string, qqLoop,  // name of the loop propagator (fetched as PropagatorField)
+                                    std::string, quSpec,  // name of the spectator propagator, source at ti (fetched as SlicedPropagator)
+                                    std::string, qdTf,    // name of the propagator with source at tf (PropagatorField)
+                                    std::string, qsTi,    // name of the propagator with source at ti (PropagatorField)
+                                    unsigned int,   tf,   // index used to pick one entry of quSpec in execute()
+                                    std::string, sink,    // declared as an input and logged by execute()
+                                    std::string, output); // first argument handed to saveResult()
+};
+
+template <typename FImpl>
+class TSigmaToNucleonEye: public Module<SigmaToNucleonEyePar> // module for the eye-topology sigma-to-nucleon contractions
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(ScalarImplCR, Scalar);
+    SINK_TYPE_ALIASES(Scalar);
+    typedef typename SpinMatrixField::vector_object::scalar_object SpinMatrix; // per-site scalar object of a SpinMatrixField
+    class Metadata: Serializable // labels attached to every correlator written by execute()
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Metadata,
+                                        Gamma::Algebra, gammaH,       // gamma of the current loop iteration over Gamma::gall
+                                        Gamma::Algebra, gammaASigma,  // set to Identity in execute()
+                                        Gamma::Algebra, gammaBSigma,  // set to SigmaXZ (C*gamma_5) in execute()
+                                        Gamma::Algebra, gammaANucl,   // set to Identity in execute()
+                                        Gamma::Algebra, gammaBNucl,   // set to SigmaXZ (C*gamma_5) in execute()
+                                        int, trace);                  // 2 for operator Q1, 1 for operator Q2
+    };
+    typedef Correlator<Metadata, SpinMatrix> Result;
+public:
+    // constructor: takes the module name
+    TSigmaToNucleonEye(const std::string name);
+    // destructor
+    virtual ~TSigmaToNucleonEye(void) {};
+    // dependency relation: names of required inputs and of declared outputs
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup: registers the temporary lattice "c"
+    virtual void setup(void);
+    // execution: performs the contractions and saves the result
+    virtual void execute(void);
+    // Which gamma algebra was specified. NOTE(review): never read or written by the methods in this header.
+    Gamma::Algebra  al;
+};
+
+MODULE_REGISTER_TMP(SigmaToNucleonEye, ARG(TSigmaToNucleonEye<FIMPL>), MContraction); // registers the FIMPL instantiation as SigmaToNucleonEye
+
+/******************************************************************************
+ *                         TSigmaToNucleonEye implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TSigmaToNucleonEye<FImpl>::TSigmaToNucleonEye(const std::string name)
+: Module<SigmaToNucleonEyePar>(name) // forwards the module name to the base class; member 'al' is left uninitialised
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TSigmaToNucleonEye<FImpl>::getInput(void)
+{
+    // Dependencies of this module: the four propagators named in the
+    // parameters, followed by the sink.
+    return {par().qqLoop, par().quSpec, par().qdTf, par().qsTi, par().sink};
+}
+
+template <typename FImpl>
+std::vector<std::string> TSigmaToNucleonEye<FImpl>::getOutput(void)
+{
+    // No named products are declared; execute() hands its correlators
+    // to saveResult() instead.
+    return {};
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TSigmaToNucleonEye<FImpl>::setup(void)
+{
+    envTmpLat(SpinMatrixField, "c"); // temporary SpinMatrixField "c"; execute() retrieves it with envGetTmp
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TSigmaToNucleonEye<FImpl>::execute(void)
+{
+    // For every gamma in Gamma::gall: contract operators Q1 and Q2, slice-sum along Tp, then save all correlators.
+    const Gamma GammaB(Gamma::Algebra::SigmaXZ); // C*gamma_5
+    const Gamma Id(Gamma::Algebra::Identity);    // identity; only recorded in the metadata below
+    LOG(Message) << "Computing sigma-to-nucleon contractions '" << getName() << "'" << std::endl;
+    LOG(Message) << " with (Gamma^A,Gamma^B)_sigma = ( Identity, C*gamma_5 ) and (Gamma^A,Gamma^B)_nucl = ( Identity, C*gamma_5 )" << std::endl;
+    LOG(Message) << " using sink " << par().sink << "." << std::endl;
+    // NOTE(review): par().sink is only logged; no sink object is fetched or applied in this function.
+    envGetTmp(SpinMatrixField, c);
+    std::vector<SpinMatrix> buf;
+
+    std::vector<Result> result;
+    Result              r;
+    r.info.gammaASigma = Id.g;
+    r.info.gammaBSigma = GammaB.g;
+    r.info.gammaANucl  = Id.g;
+    r.info.gammaBNucl  = GammaB.g;
+
+    auto &qqLoop    = envGet(PropagatorField, par().qqLoop);
+    auto &quSpec    = envGet(SlicedPropagator, par().quSpec);
+    auto &qdTf      = envGet(PropagatorField, par().qdTf);
+    auto &qsTi      = envGet(PropagatorField, par().qsTi);
+    auto qut         = quSpec[par().tf]; // NOTE(review): tf is not range-checked against quSpec.size()
+    for (auto &G: Gamma::gall)
+    {
+      r.info.gammaH = G.g;
+      //Operator Q1, equivalent to the two-trace case in the rare-kaons module
+      c=Zero();
+      BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(qqLoop,qut,qdTf,qsTi,G,GammaB,GammaB,"Q1",c); // use the class template parameter, not the FIMPL macro
+      sliceSum(c,buf,Tp);
+      r.corr.clear();
+      for (unsigned int t = 0; t < buf.size(); ++t)
+      {
+          r.corr.push_back(buf[t]);
+      }
+      r.info.trace = 2;
+      result.push_back(r);
+      //Operator Q2, equivalent to the one-trace case in the rare-kaons module
+      c=Zero();
+      BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(qqLoop,qut,qdTf,qsTi,G,GammaB,GammaB,"Q2",c);
+      sliceSum(c,buf,Tp);
+      r.corr.clear();
+      for (unsigned int t = 0; t < buf.size(); ++t)
+      {
+          r.corr.push_back(buf[t]);
+      }
+      r.info.trace = 1;
+      result.push_back(r);
+    }
+
+    saveResult(par().output, "stnEye", result);
+
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MContraction_SigmaToNucleonEye_hpp_
--- a/Hadrons/Modules/MContraction/SigmaToNucleonNonEye.cc
+++ b/Hadrons/Modules/MContraction/SigmaToNucleonNonEye.cc
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/MContraction/SigmaToNucleonNonEye.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TSigmaToNucleonNonEye<FIMPL>; // explicit instantiation of the module template for FIMPL
--- a/Hadrons/Modules/MContraction/SigmaToNucleonNonEye.hpp
+++ b/Hadrons/Modules/MContraction/SigmaToNucleonNonEye.hpp
@@ -0,0 +1,224 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/SigmaToNucleonNonEye.hpp
+
+Copyright (C) 2015-2019
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Felix Erben <felix.erben@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MContraction_SigmaToNucleonNonEye_hpp_
+#define Hadrons_MContraction_SigmaToNucleonNonEye_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Grid/qcd/utils/BaryonUtils.h>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                               SigmaToNucleonNonEye                                       *
+ ******************************************************************************/
+/*
+ * Sigma-to-Nucleon 3-pt diagrams, non-eye topologies.
+ * 
+ * Schematic:     
+ *            qsTi          quTf           |            qsTi            qdTf
+ *          /-->--¬       /-->--¬          |          /-->--¬         /-->--¬       
+ *         /       \     /       \         |         /       \       /       \      
+ *        /         \   /         \        |        /         \     /         \     
+ *       /           \ /           \       |       /           \   /           \    
+ *      *             * G           *      |       *           G * * G          * 
+ *     |\             * G           |      |      |\           /   \           /|
+ *     | \           / \           /|      |      | \         /     \         / |   
+ *     |  \         /   \         / |      |      |  \       /       \       /  |
+ *     |   \       /     \       /  |      |      |   \-->--/         \-->--/   |   
+ *      \   \-->--/       \-->--/  /       |       \   quTi            quTf    /
+ *       \    quTi          qdTf  /        |        \                         /
+ *        \                      /         |         \                       /
+ *         \--------->----------/          |          \--------->-----------/
+ *                 quSpec                  |                  quSpec
+ *
+ *
+ * analogously to the rare-kaon naming, the left diagram is named 'one-trace' and
+ * the diagram on the right 'two-trace'
+ * 
+ * Propagators:
+ *  * quTi,   source at ti 
+ *  * quTf,   source at tf
+ *  * quSpec, source at ti
+ *  * qdTf,   source at tf 
+ *  * qsTi,   source at ti
+ */
+BEGIN_MODULE_NAMESPACE(MContraction)
+
+class SigmaToNucleonNonEyePar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(SigmaToNucleonNonEyePar,
+                                    std::string, quTi,    // name of the propagator with source at ti (PropagatorField)
+                                    std::string, quTf,    // name of the propagator with source at tf (PropagatorField)
+                                    std::string, quSpec,  // name of the spectator propagator, source at ti (fetched as SlicedPropagator)
+                                    std::string, qdTf,    // name of the propagator with source at tf (PropagatorField)
+                                    std::string, qsTi,    // name of the propagator with source at ti (PropagatorField)
+                                    unsigned int,   tf,   // index used to pick one entry of quSpec in execute()
+                                    std::string, sink,    // declared as an input and logged by execute()
+                                    std::string, output); // first argument handed to saveResult()
+};
+
+template <typename FImpl>
+class TSigmaToNucleonNonEye: public Module<SigmaToNucleonNonEyePar> // module for the non-eye-topology sigma-to-nucleon contractions
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(ScalarImplCR, Scalar);
+    SINK_TYPE_ALIASES(Scalar);
+    typedef typename SpinMatrixField::vector_object::scalar_object SpinMatrix; // per-site scalar object of a SpinMatrixField
+    class Metadata: Serializable // labels attached to every correlator written by execute()
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Metadata,
+                                        Gamma::Algebra, gammaH,       // gamma of the current loop iteration over Gamma::gall
+                                        Gamma::Algebra, gammaASigma,  // set to Identity in execute()
+                                        Gamma::Algebra, gammaBSigma,  // set to SigmaXZ (C*gamma_5) in execute()
+                                        Gamma::Algebra, gammaANucl,   // set to Identity in execute()
+                                        Gamma::Algebra, gammaBNucl,   // set to SigmaXZ (C*gamma_5) in execute()
+                                        int, trace);                  // 2 for operator Q1, 1 for operator Q2
+    };
+    typedef Correlator<Metadata, SpinMatrix> Result;
+public:
+    // constructor: takes the module name
+    TSigmaToNucleonNonEye(const std::string name);
+    // destructor
+    virtual ~TSigmaToNucleonNonEye(void) {};
+    // dependency relation: names of required inputs and of declared outputs
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup: registers the temporary lattice "c"
+    virtual void setup(void);
+    // execution: performs the contractions and saves the result
+    virtual void execute(void);
+    // Which gamma algebra was specified. NOTE(review): never read or written by the methods in this header.
+    Gamma::Algebra  al;
+};
+
+MODULE_REGISTER_TMP(SigmaToNucleonNonEye, ARG(TSigmaToNucleonNonEye<FIMPL>), MContraction); // registers the FIMPL instantiation as SigmaToNucleonNonEye
+
+/******************************************************************************
+ *                         TSigmaToNucleonNonEye implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TSigmaToNucleonNonEye<FImpl>::TSigmaToNucleonNonEye(const std::string name)
+: Module<SigmaToNucleonNonEyePar>(name) // forwards the module name to the base class; member 'al' is left uninitialised
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TSigmaToNucleonNonEye<FImpl>::getInput(void)
+{
+    // Dependencies of this module: the five propagators named in the
+    // parameters, followed by the sink.
+    return {par().quTi, par().quTf, par().quSpec, par().qdTf, par().qsTi, par().sink};
+}
+
+template <typename FImpl>
+std::vector<std::string> TSigmaToNucleonNonEye<FImpl>::getOutput(void)
+{
+    // No named products are declared; execute() hands its correlators
+    // to saveResult() instead.
+    return {};
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TSigmaToNucleonNonEye<FImpl>::setup(void)
+{
+    envTmpLat(SpinMatrixField, "c"); // temporary SpinMatrixField "c"; execute() retrieves it with envGetTmp
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TSigmaToNucleonNonEye<FImpl>::execute(void)
+{
+    // For every gamma in Gamma::gall: contract operators Q1 and Q2, slice-sum along Tp, then save all correlators.
+    const Gamma GammaB(Gamma::Algebra::SigmaXZ); // C*gamma_5
+    const Gamma Id(Gamma::Algebra::Identity);    // identity; only recorded in the metadata below
+    LOG(Message) << "Computing sigma-to-nucleon contractions '" << getName() << "'" << std::endl;
+    LOG(Message) << " with (Gamma^A,Gamma^B)_sigma = ( Identity, C*gamma_5 ) and (Gamma^A,Gamma^B)_nucl = ( Identity, C*gamma_5 )" << std::endl;
+    LOG(Message) << " using sink " << par().sink << "." << std::endl;
+    // NOTE(review): par().sink is only logged; no sink object is fetched or applied in this function.
+    envGetTmp(SpinMatrixField, c);
+    std::vector<SpinMatrix> buf;
+
+    std::vector<Result> result;
+    Result              r;
+    r.info.gammaASigma = Id.g;
+    r.info.gammaBSigma = GammaB.g;
+    r.info.gammaANucl  = Id.g;
+    r.info.gammaBNucl  = GammaB.g;
+
+    auto &quTi      = envGet(PropagatorField, par().quTi);
+    auto &quTf      = envGet(PropagatorField, par().quTf);
+    auto &quSpec    = envGet(SlicedPropagator, par().quSpec);
+    auto &qdTf      = envGet(PropagatorField, par().qdTf);
+    auto &qsTi      = envGet(PropagatorField, par().qsTi);
+    auto qut         = quSpec[par().tf]; // NOTE(review): tf is not range-checked against quSpec.size()
+    for (auto &G: Gamma::gall)
+    {
+      r.info.gammaH = G.g;
+      //Operator Q1, equivalent to the two-trace case in the rare-kaons module
+      c=Zero();
+      BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(quTi,quTf,qut,qdTf,qsTi,G,GammaB,GammaB,"Q1",c); // use the class template parameter, not the FIMPL macro
+      sliceSum(c,buf,Tp);
+      r.corr.clear();
+      for (unsigned int t = 0; t < buf.size(); ++t)
+      {
+          r.corr.push_back(buf[t]);
+      }
+      r.info.trace = 2;
+      result.push_back(r);
+      //Operator Q2, equivalent to the one-trace case in the rare-kaons module
+      c=Zero();
+      BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(quTi,quTf,qut,qdTf,qsTi,G,GammaB,GammaB,"Q2",c);
+      sliceSum(c,buf,Tp);
+      r.corr.clear();
+      for (unsigned int t = 0; t < buf.size(); ++t)
+      {
+          r.corr.push_back(buf[t]);
+      }
+      r.info.trace = 1;
+      result.push_back(r);
+    }
+
+    saveResult(par().output, "stnNonEye", result);
+
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MContraction_SigmaToNucleonNonEye_hpp_
--- a/Hadrons/Modules/MContraction/WeakEye3pt.hpp
+++ b/Hadrons/Modules/MContraction/WeakEye3pt.hpp
@@ -144,7 +144,7 @@ void TWeakEye3pt<FImpl>::execute(void)
 {
    LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl;
    LOG(Message) << "gIn : " << par().gammaIn << std::endl;
-    LOG(Message) << "gOut: " << par().gammaIn << std::endl;
+    LOG(Message) << "gOut: " << par().gammaOut << std::endl;
    LOG(Message) << "tOut: " << par().tOut << std::endl;
    LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
    LOG(Message) << "qbr : " << par().qBarRight << std::endl;
--- a/Hadrons/Modules/MContraction/WeakNonEye3pt.hpp
+++ b/Hadrons/Modules/MContraction/WeakNonEye3pt.hpp
@@ -144,7 +144,7 @@ void TWeakNonEye3pt<FImpl>::execute(void)
 {
    LOG(Message) << "Computing mesonic weak 3pt contractions, non-eye topologies" << std::endl;
    LOG(Message) << "gIn : " << par().gammaIn << std::endl;
-    LOG(Message) << "gOut: " << par().gammaIn << std::endl;
+    LOG(Message) << "gOut: " << par().gammaOut << std::endl;
    LOG(Message) << "ql  : " << par().qLeft << std::endl;
    LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
    LOG(Message) << "qr  : " << par().qRight << std::endl;
--- a/Hadrons/Modules/MDistil/Distil.hpp
+++ b/Hadrons/Modules/MDistil/Distil.hpp
@@ -0,0 +1,124 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/Distil.hpp
+ 
+ Copyright (C) 2015-2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MDistil_Distil_hpp_
+#define Hadrons_MDistil_Distil_hpp_
+
+#include <Hadrons/NamedTensor.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Solver.hpp>
+#include <Hadrons/A2AVectors.hpp>
+#include <Hadrons/DilutedNoise.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+BEGIN_MODULE_NAMESPACE(MDistil)
+
+/******************************************************************************
+ Distillation code that is common across modules
+
+ Documentation on how to use this code available at
+
+ *  https://aportelli.github.io/Hadrons-doc/#/mdistil  *
+ 
+ Notation for (stochastic) DistilParameters taken from 1104.3870:
+
+ TI is interlaced dilution in time (corresponding to Nt = time-dimension of the lattice)
+ LI is interlaced dilution in laplacian-eigenvector space (corresponding to nvec)
+ SI is interlaced dilution in spin (corresponding to Ns, taken from Grid, usually Ns=4)
+
+ This code automatically computes perambulators using exact distillation if
+ *   (TI,LI,SI) = (Nt,nvec,Ns)   *
+ In this case, nnoise=1 and Noises is set to an array of values =1 as well.
+ tsrc then specifies the only timeslice on which the sources are supported.
+ (( for stochastic distillation, the value of tsrc has no meaning in this code ))
+
+ ******************************************************************************/
+
+struct DistilParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(DistilParameters,
+                                    int, nvec,    // size of the laplacian-eigenvector space (LI dilutes within it)
+                                    int, nnoise,  // number of noises; 1 in the exact-distillation case
+                                    int, tsrc,    // source timeslice; only meaningful for exact distillation
+                                    int, TI,      // interlaced dilution in time
+                                    int, LI,      // interlaced dilution in laplacian-eigenvector space
+                                    int, SI )     // interlaced dilution in spin
+};
+
+/******************************************************************************
+ Make a lower dimensional grid (last dimension collapsed to extent 1) for local slice operations
+ ******************************************************************************/
+
+inline void MakeLowerDimGrid( std::unique_ptr<GridCartesian> &up, GridCartesian * gridHD )
+{
+    const int last = static_cast<int>(gridHD->_ndimension) - 1; // index of the dimension being collapsed
+    Coordinate lattice = gridHD->_gdimensions;
+    Coordinate ranks   = gridHD->_processors;
+    Coordinate simd    = GridDefaultSimd(last, vComplex::Nsimd());
+    simd.push_back( 1 );   // SIMD layout entry for the last dimension is 1
+    lattice[last] = 1;     // single site along the last dimension
+    ranks[last]   = 1;     // processor-grid entry for the last dimension is 1
+    up.reset( new GridCartesian(lattice,simd,ranks,*gridHD) ); // up now owns the new grid, built with gridHD as parent
+}
+
+/*************************************************************************************
+ Rotate eigenvectors into our phase convention:
+ first component of first eigenvector (at site 0) is real and non-negative.
+ evec must be non-empty: evec[0] is read unconditionally.
+ *************************************************************************************/
+
+inline void RotateEigen(std::vector<LatticeColourVector> & evec)
+{
+    ColourVector cv0;
+    auto grid = evec[0].Grid();
+    Coordinate siteFirst(grid->Nd(),0);
+    peekSite(cv0, evec[0], siteFirst);
+    const std::complex<Real> cplx0{cv0()()(0).real(), cv0()()(0).imag()};
+    // Zero imaginary part alone is not enough: a negative real value still needs a rotation by pi
+    if( cplx0.imag() == 0 && cplx0.real() >= 0 )
+        LOG(Message) << "RotateEigen() : Site 0 : " << cplx0 << " => already meets phase convention" << std::endl;
+    else
+    {
+        const Real cplx0_mag{ std::abs(cplx0) };
+        const std::complex<Real> std_phase{std::conj(cplx0/cplx0_mag)};
+        LOG(Message) << "RotateEigen() : Site 0 : |" << cplx0 << "|=" << cplx0_mag
+                     << " => phase=" << (std::arg(std_phase) / M_PI) << " pi" << std::endl;
+        const Grid::Complex phase{std_phase.real(),std_phase.imag()};
+        for( auto &v : evec )
+            v *= phase;
+        // Get rid of the rounding error in imaginary phase on the very first site
+        peekSite(cv0, evec[0], siteFirst);
+        cv0()()(0).imag(0); // this should be zero after the phase multiply - force it to be so
+        pokeSite(cv0, evec[0], siteFirst);
+    }
+}
+
+END_MODULE_NAMESPACE
+END_HADRONS_NAMESPACE
+#endif
--- a/Hadrons/Modules/MDistil/DistilPar.cc
+++ b/Hadrons/Modules/MDistil/DistilPar.cc
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/MDistil/DistilPar.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MDistil;
+
+template class Grid::Hadrons::MDistil::TDistilPar<FIMPL>; // explicit instantiation of the module template for FIMPL
--- a/Hadrons/Modules/MDistil/DistilPar.hpp
+++ b/Hadrons/Modules/MDistil/DistilPar.hpp
@@ -0,0 +1,97 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/DistilPar.hpp
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MDistil_DistilPar_hpp_
+#define Hadrons_MDistil_DistilPar_hpp_
+
+#include <Hadrons/Modules/MDistil/Distil.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+BEGIN_MODULE_NAMESPACE(MDistil)
+
+/******************************************************************************
+ *                         DistilPar                                 *
+ ******************************************************************************/
+
+template <typename FImpl>
+class TDistilPar: public Module<DistilParameters> // module whose parameters are a DistilParameters; FImpl is unused by the members declared here
+{
+public:
+    // constructor: takes the module name
+    TDistilPar(const std::string name);
+    // destructor
+    virtual ~TDistilPar(void) {};
+    // dependency relation: no inputs; one output named after the module
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output object from par()
+    virtual void setup(void);
+    // execution: intentionally does nothing
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(DistilPar, TDistilPar<FIMPL>, MDistil); // registers the FIMPL instantiation as DistilPar
+
+/******************************************************************************
+ *                 TDistilPar implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TDistilPar<FImpl>::TDistilPar(const std::string name) : Module<DistilParameters>(name) {} // forwards the module name to the base class
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TDistilPar<FImpl>::getInput(void)
+{
+    return {}; // this module depends on no other object
+}
+
+template <typename FImpl>
+std::vector<std::string> TDistilPar<FImpl>::getOutput(void)
+{
+    return {getName()}; // single product, named after the module; setup() creates it
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TDistilPar<FImpl>::setup(void)
+{
+    envCreate(DistilParameters, getName(), 1, par() ); // DistilParameters object named after the module, built from par(); NOTE(review): meaning of the third argument (1) is not visible here - confirm
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TDistilPar<FImpl>::execute(void)
+{
+    // Intentionally empty: setup() already created and initialised the output object from par()
+}
+
+END_MODULE_NAMESPACE
+END_HADRONS_NAMESPACE
+#endif
--- a/Hadrons/Modules/MDistil/DistilVectors.cc
+++ b/Hadrons/Modules/MDistil/DistilVectors.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/DistilVectors.cc
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Modules/MDistil/DistilVectors.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MDistil;
+
+template class Grid::Hadrons::MDistil::TDistilVectors<FIMPL>; // explicit instantiation of the module template for FIMPL
--- a/Hadrons/Modules/MDistil/DistilVectors.hpp
+++ b/Hadrons/Modules/MDistil/DistilVectors.hpp
@@ -0,0 +1,243 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/DistilVectors.hpp
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MDistil_DistilVectors_hpp_
+#define Hadrons_MDistil_DistilVectors_hpp_
+
+#include <Hadrons/Modules/MDistil/Distil.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+BEGIN_MODULE_NAMESPACE(MDistil)
+
+/******************************************************************************
+ *                         DistilVectors                                      *
+ *                (Create rho and/or phi vectors)                             *
+ ******************************************************************************/
+
+class DistilVectorsPar: Serializable // Parameters of the DistilVectors module; at least one of rho/phi must be non-empty
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(DistilVectorsPar,
+                                    std::string, noise,         // input: name of the NoiseTensor object
+                                    std::string, perambulator,  // input: name of the PerambTensor object
+                                    std::string, lapevec,       // input: name of the EigenPack<LatticeColourVector> object
+                                    std::string, rho,           // output: name for the rho vectors (empty = do not create)
+                                    std::string, phi,           // output: name for the phi vectors (empty = do not create)
+                                    std::string, DistilParams); // input: name of the DistilParameters object
+};
+
+template <typename FImpl>
+class TDistilVectors: public Module<DistilVectorsPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    // constructor
+    TDistilVectors(const std::string name);
+    // destructor
+    virtual ~TDistilVectors(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void); // also caches par().rho / par().phi in RhoName / PhiName
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+protected:
+    std::unique_ptr<GridCartesian> grid3d; // Lower-dimensional grid built in setup(); released automatically by unique_ptr
+public:
+    // Output object names copied from the parameters by getOutput(); an empty name means "not requested"
+    std::string RhoName;
+    std::string PhiName;
+};
+
+MODULE_REGISTER_TMP(DistilVectors, TDistilVectors<FIMPL>, MDistil);
+
+/******************************************************************************
+ *                 TDistilVectors implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TDistilVectors<FImpl>::TDistilVectors(const std::string name) : Module<DistilVectorsPar>(name) {} // forwards the instance name to the Module base
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TDistilVectors<FImpl>::getInput(void)
+{
+    return std::vector<std::string>{ par().noise, par().perambulator, par().lapevec, par().DistilParams }; // all four inputs are always required
+}
+
+template <typename FImpl>
+std::vector<std::string> TDistilVectors<FImpl>::getOutput(void)
+{
+    RhoName = par().rho; // cached: setup() and execute() read these members
+    PhiName = par().phi;
+    std::vector<std::string> products; // rho first, then phi, skipping unnamed outputs
+    if (!RhoName.empty())
+        products.push_back(RhoName);
+    if (!PhiName.empty())
+        products.push_back(PhiName);
+    if (products.empty())
+    {
+        HADRONS_ERROR(Argument,"No output specified");
+    }
+    return products;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TDistilVectors<FImpl>::setup(void)
+{
+    // The perambulator must carry the index names this module expects
+    auto &perambulator = envGet(PerambTensor, par().perambulator);
+    if (!perambulator.ValidateIndexNames())
+    {
+        HADRONS_ERROR(Range,"Perambulator index names bad");
+    }
+
+    const DistilParameters &dp{envGet(DistilParameters, par().DistilParams)};
+    const int Nt{env().getDim(Tdir)};
+    const bool full_tdil{ dp.TI == Nt };
+    const int Nt_inv{ full_tdil ? 1 : dp.TI };
+    // One vector per (noise, LI, SI, time-dilution) index combination;
+    // when dp.TI equals Nt only a single time-dilution block is stored
+    const auto nVectors = dp.nnoise*dp.LI*dp.SI*Nt_inv;
+
+    // RhoName / PhiName are filled in by getOutput(); an empty name means that output was not requested
+    if (!RhoName.empty())
+        envCreate(std::vector<FermionField>, RhoName, 1, nVectors, envGetGrid(FermionField));
+    if (!PhiName.empty())
+        envCreate(std::vector<FermionField>, PhiName, 1, nVectors, envGetGrid(FermionField));
+
+    // Lower-dimensional grid for the single-timeslice temporaries below
+    // NOTE(review): its layout is presumably derived from grid4d inside MakeLowerDimGrid - confirm
+    GridCartesian * const grid4d{env().getGrid()};
+    MakeLowerDimGrid(grid3d, grid4d);
+
+    envTmp(LatticeSpinColourVector, "source4d",1,LatticeSpinColourVector(grid4d));
+    envTmp(LatticeSpinColourVector, "source3d",1,LatticeSpinColourVector(grid3d.get()));
+    envTmp(LatticeColourVector, "source3d_nospin",1,LatticeColourVector(grid3d.get()));
+    envTmp(LatticeSpinColourVector, "sink3d",1,LatticeSpinColourVector(grid3d.get()));
+    envTmp(LatticeColourVector, "evec3d",1,LatticeColourVector(grid3d.get()));
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TDistilVectors<FImpl>::execute(void)
+{
+    auto &noise        = envGet(NoiseTensor,  par().noise);
+    auto &perambulator = envGet(PerambTensor, par().perambulator);
+    auto &epack        = envGet(Grid::Hadrons::EigenPack<LatticeColourVector>, par().lapevec);
+    const DistilParameters &dp{envGet(DistilParameters, par().DistilParams)};
+
+    envGetTmp(LatticeSpinColourVector, source4d);
+    envGetTmp(LatticeSpinColourVector, source3d);
+    envGetTmp(LatticeColourVector,     source3d_nospin);
+    envGetTmp(LatticeSpinColourVector, sink3d);
+    envGetTmp(LatticeColourVector,     evec3d);
+
+    GridCartesian * const grid4d{env().getGrid()};
+    const int Ntlocal{ grid4d->LocalDimensions()[Tdir] }; // number of timeslices held by this rank
+    const int Ntfirst{ grid4d->LocalStarts()[Tdir] };     // global index of this rank's first timeslice
+
+    const int Nt{env().getDim(Tdir)};
+    const bool full_tdil{ dp.TI == Nt };
+    const int Nt_inv{ full_tdil ? 1 : dp.TI };
+
+    // rho: for each dilution index, sum the noise-weighted eigenvector timeslices into a 4d source
+    if (!RhoName.empty())
+    {
+        auto &rho = envGet(std::vector<FermionField>, RhoName);
+        for (int inoise = 0; inoise < dp.nnoise; inoise++)
+        {
+            for (int dk = 0; dk < dp.LI; dk++)
+            {
+                for (int dt = 0; dt < Nt_inv; dt++)
+                {
+                    for (int ds = 0; ds < dp.SI; ds++)
+                    {
+                        const int vecindex = inoise + dp.nnoise * (dk + dp.LI * (ds + dp.SI * dt));
+                        rho[vecindex] = 0;
+                        for (int it = dt; it < Nt; it += dp.TI)
+                        {
+                            const int t_inv{full_tdil ? dp.tsrc : it};
+                            if (t_inv >= Ntfirst && t_inv < Ntfirst + Ntlocal) // only timeslices stored on this rank
+                            {
+                                for (int ik = dk; ik < dp.nvec; ik += dp.LI)
+                                {
+                                    ExtractSliceLocal(evec3d,epack.evec[ik],0,t_inv-Ntfirst,Tdir); // spin-independent: extract once per eigenvector
+                                    for (int is = ds; is < Ns; is += dp.SI)
+                                    {
+                                        source3d_nospin = evec3d * noise.tensor(inoise, t_inv, ik, is);
+                                        source3d=0;
+                                        pokeSpin(source3d,source3d_nospin,is);
+                                        source4d=0;
+                                        InsertSliceLocal(source3d,source4d,0,t_inv-Ntfirst,Tdir);
+                                        rho[vecindex] += source4d;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    if (!PhiName.empty()) // phi: per local timeslice, sum the eigenvectors weighted by the perambulator
+    {
+        auto &phi = envGet(std::vector<FermionField>, PhiName);
+        for (int inoise = 0; inoise < dp.nnoise; inoise++)
+        {
+            for (int dk = 0; dk < dp.LI; dk++)
+            {
+                for (int dt = 0; dt < Nt_inv; dt++)
+                {
+                    for (int ds = 0; ds < dp.SI; ds++)
+                    {
+                        const int vecindex = inoise + dp.nnoise * (dk + dp.LI * (ds + dp.SI * dt));
+                        phi[vecindex] = 0;
+                        for (int t = Ntfirst; t < Ntfirst + Ntlocal; t++)
+                        {
+                            sink3d=0;
+                            for (int ivec = 0; ivec < dp.nvec; ivec++)
+                            {
+                                ExtractSliceLocal(evec3d,epack.evec[ivec],0,t-Ntfirst,Tdir);
+                                sink3d += evec3d * perambulator.tensor(t, ivec, dk, inoise,dt,ds);
+                            }
+                            InsertSliceLocal(sink3d,phi[vecindex],0,t-Ntfirst,Tdir);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+END_MODULE_NAMESPACE
+END_HADRONS_NAMESPACE
+#endif // Hadrons_MDistil_DistilVectors_hpp_
--- a/Hadrons/Modules/MDistil/LapEvec.cc
+++ b/Hadrons/Modules/MDistil/LapEvec.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/LapEvec.cc
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Modules/MDistil/LapEvec.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MDistil;
+
+template class Grid::Hadrons::MDistil::TLapEvec<GIMPL>; // explicit instantiation: compile the module for GIMPL in this translation unit
--- a/Hadrons/Modules/MDistil/LapEvec.hpp
+++ b/Hadrons/Modules/MDistil/LapEvec.hpp
@@ -0,0 +1,335 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/LapEvec.hpp
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MDistil_LapEvec_hpp_
+#define Hadrons_MDistil_LapEvec_hpp_
+
+#include <Hadrons/Modules/MDistil/Distil.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+BEGIN_MODULE_NAMESPACE(MDistil)
+
+/******************************************************************************
+ 
+ Laplacian eigenvectors - parameters
+
+ Computes the eigenvectors of the 3D-Laplacian, built from stout-smeared 
+ gauge links with the specified number of steps and smearing parameter rho. 
+ The smearing is only applied to the spatial components of the gauge field,
+ i.e. rho_{4i} = rho_{i4} = rho_{44} = 0. 
+
+ Chebyshev-preconditioning is needed for convergence of the nvec lowest 
+ eigenvectors.
+ 
+ ******************************************************************************/
+
+struct StoutParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(StoutParameters,
+                                    int, steps,   // number of smearing iterations (0 skips smearing)
+                                    double, rho)  // smearing parameter handed to Smear_Stout
+    StoutParameters() = default;
+    template <class ReaderClass> StoutParameters(Reader<ReaderClass>& reader){read(reader,"StoutSmearing",*this);}
+};
+
+struct ChebyshevParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyshevParameters,
+                                    int, PolyOrder,  // order passed to the Chebyshev polynomial
+                                    double, alpha,   // first Chebyshev constructor argument
+                                    double, beta)    // second Chebyshev constructor argument
+    ChebyshevParameters() = default;
+    template <class ReaderClass> ChebyshevParameters(Reader<ReaderClass>& reader){read(reader,"Chebyshev",*this);}
+};
+
+struct LanczosParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+                                    int, Nvec,      // eigenvectors requested (and kept) per timeslice
+                                    int, Nk,        // passed to the Lanczos solver as Nk
+                                    int, Np,        // solver is given Nk+Np as its third size argument
+                                    int, MaxIt,     // iteration limit passed to the solver
+                                    double, resid,  // residual passed to the solver
+                                    int, IRLLog)    // 0 switches the GridLogIRL channel off during execute()
+    LanczosParameters() = default;
+    template <class ReaderClass> LanczosParameters(Reader<ReaderClass>& reader){read(reader,"Lanczos",*this);}
+};
+
+// These are the actual parameters passed to the module during construction
+
+struct LapEvecPar: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LapEvecPar
+                                    ,std::string,         gauge     // name of the input gauge field object
+                                    ,StoutParameters,     Stout     // stout smearing applied before the eigensolve
+                                    ,ChebyshevParameters, Cheby     // Chebyshev polynomial used with the Lanczos solver
+                                    ,LanczosParameters,   Lanczos   // implicitly restarted Lanczos settings
+                                    ,std::string,         FileName) // if non-empty, the eigenpack is written to FileName.<trajectory>
+};
+
+/******************************************************************************
+ 
+ Laplacian eigenvectors - Module (class) definition
+ 
+ ******************************************************************************/
+
+template <typename GImpl>
+class TLapEvec: public Module<LapEvecPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
+    // constructor
+    TLapEvec(const std::string name);
+    // destructor
+    virtual ~TLapEvec(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+protected:
+    std::unique_ptr<GridCartesian> gridLD; // Lower-dimensional grid built in setup(); released automatically by unique_ptr
+};
+
+MODULE_REGISTER_TMP(LapEvec, TLapEvec<GIMPL>, MDistil);
+
+/******************************************************************************
+ TLapEvec implementation
+ ******************************************************************************/
+
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TLapEvec<GImpl>::TLapEvec(const std::string name) : Module<LapEvecPar>(name) {} // forwards the instance name to the Module base
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TLapEvec<GImpl>::getInput(void)
+{
+    return {par().gauge}; // the gauge field is the only dependency
+}
+
+template <typename GImpl>
+std::vector<std::string> TLapEvec<GImpl>::getOutput(void)
+{
+    return std::vector<std::string>{getName()}; // the higher dimensional eigenpack, named after the module
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TLapEvec<GImpl>::setup(void)
+{
+    GridCartesian * const fullGrid = env().getGrid();
+    MakeLowerDimGrid(gridLD,fullGrid);
+    const int nLocalSlices{fullGrid->LocalDimensions()[Tdir]};
+    // Scratch objects: two full-grid gauge fields, plus per-timeslice work space on the lower-dimensional grid
+    envTmpLat(GaugeField, "Umu_stout");
+    envTmpLat(GaugeField, "Umu_smear");
+    envTmp(LatticeGaugeField, "UmuNoTime",1,LatticeGaugeField(gridLD.get()));
+    envTmp(LatticeColourVector, "src",1,LatticeColourVector(gridLD.get()));
+    envTmp(std::vector<LapEvecs>, "eig",1,std::vector<LapEvecs>(nLocalSlices));
+    // Result: eigenpack of Nvec vectors on the full grid
+    envCreate(LapEvecs, getName(), 1, par().Lanczos.Nvec, fullGrid);
+}
+
+/*************************************************************************************
+ 
+ -Grad^2 (Peardon, 2009, pg 2, equation 3, https://arxiv.org/abs/0905.2160)
+ Field      Type of field the operator will be applied to
+ GaugeField Gauge field the operator will smear using
+ 
+ *************************************************************************************/
+
+template<typename Field, typename GaugeField=LatticeGaugeField>
+class Laplacian3D : public LinearOperatorBase<Field>, public LinearFunction<Field> {
+    typedef typename GaugeField::vector_type vCoeff_t;
+public:
+    int          nd; // number of spatial dimensions
+    std::vector<Lattice<iColourMatrix<vCoeff_t> > > U; // one link field per direction mu < nd
+    // Construct this operator given a gauge field and the number of dimensions it should act on
+    Laplacian3D( GaugeField& gf, int dimSpatial = Tdir ) : nd{dimSpatial}
+    {
+        if (dimSpatial<1)
+        {
+            HADRONS_ERROR(Range,"Must be at least one spatial dimension");
+        }
+        for (int mu = 0 ; mu < nd ; mu++)
+            U.push_back(PeekIndex<LorentzIndex>(gf,mu));
+    }
+
+    // Apply this operator to "in", return result in "out"
+    void operator()(const Field& in, Field& out) {
+        if (nd > in.Grid()->Nd())
+        {
+            HADRONS_ERROR(Range,"nd too large");
+        }
+        conformable( in, out );
+        // Diagonal term 2*nd, then subtract the forward and backward link-weighted shifts in each direction
+        out = ( ( Real ) ( 2 * nd ) ) * in;
+        Field tmp_(in.Grid());
+        for (int mu = 0 ; mu < nd ; mu++)
+        {
+            out -= U[mu] * Cshift( in, mu, 1);
+            tmp_ = adj( U[mu] ) * in;
+            out -= Cshift(tmp_,mu,-1);
+        }
+    }
+
+    // Only HermOp() is implemented (it forwards to operator()); the remaining operator entry points raise an error
+    void OpDiag (const Field &in, Field &out) { HADRONS_ERROR(Definition, "OpDiag() undefined"); }
+    void OpDir  (const Field &in, Field &out,int dir,int disp) { HADRONS_ERROR(Definition, "OpDir() undefined"); }
+    void Op     (const Field &in, Field &out) { HADRONS_ERROR(Definition, "Op() undefined"); }
+    void AdjOp  (const Field &in, Field &out) { HADRONS_ERROR(Definition, "AdjOp() undefined"); }
+    void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { HADRONS_ERROR(Definition, "HermOpAndNorm() undefined"); }
+    void HermOp(const Field &in, Field &out) { operator()(in,out); }
+};
+
+template<typename Field>
+class Laplacian3DHerm : public LinearFunction<Field> {
+public:
+    OperatorFunction<Field>   & poly_;  // function applied to the operator (held by reference, not owned)
+    LinearOperatorBase<Field> &Linop_;  // operator the function is applied to (held by reference, not owned)
+    Laplacian3DHerm(OperatorFunction<Field> & polynomial,LinearOperatorBase<Field>& op)
+    : poly_{polynomial}, Linop_{op} {}
+    void operator()(const Field& in, Field& out)
+    {
+        poly_(Linop_,in,out); // out = poly_ evaluated on Linop_, acting on in
+    }
+};
+
+/******************************************************************************
+ Calculate low-mode eigenvalues of the Laplacian
+ ******************************************************************************/
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TLapEvec<GImpl>::execute(void) // smear the gauge field, eigensolve each local timeslice, assemble and optionally write the eigenpack
+{
+    const ChebyshevParameters &ChebPar{par().Cheby};
+    const LanczosParameters   &LPar{par().Lanczos};
+    
+    // Switch IRL logging on or off as requested; the previous state is restored after the timeslice loop
+    LOG(Message) << "IRLLog=" << LPar.IRLLog << std::endl;
+    const int PreviousIRLLogState{GridLogIRL.isActive()};
+    GridLogIRL.Active( LPar.IRLLog == 0 ? 0 : 1 );
+    
+    // Stout smearing
+    envGetTmp(GaugeField, Umu_smear);
+    Umu_smear = envGet(GaugeField, par().gauge); // The smeared field starts off as the Gauge field
+    LOG(Message) << "Initial plaquette: " << WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu_smear) << std::endl;
+    const StoutParameters &Stout{par().Stout};
+    if( Stout.steps )
+    {
+        envGetTmp(GaugeField, Umu_stout);
+        Smear_Stout<PeriodicGimplR> LS(Stout.rho, Tdir); // spatial smearing only
+        for (int i = 0; i < Stout.steps; i++) {
+            LS.smear(Umu_stout, Umu_smear);
+            Umu_smear = Umu_stout;
+        }
+        LOG(Message) << "Smeared plaquette: " << WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu_smear) << std::endl;
+    }
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Find the low modes of the Laplacian separately on each local time-slice
+    ////////////////////////////////////////////////////////////////////////
+    
+    auto & eig4d = envGet(LapEvecs, getName() );
+    envGetTmp(std::vector<LapEvecs>, eig);   // Eigenpack for each timeslice
+    envGetTmp(LatticeGaugeField, UmuNoTime); // Gauge field without time dimension
+    envGetTmp(LatticeColourVector, src);
+    GridCartesian * gridHD = env().getGrid();
+    const int Ntlocal{gridHD->LocalDimensions()[Tdir]};
+    const int Ntfirst{gridHD->LocalStarts()[Tdir]};
+    uint32_t ConvergenceErrors{0};
+    for (int t = 0; t < Ntlocal; t++ )
+    {
+        LOG(Message) << "------------------------------------------------------------" << std::endl;
+        LOG(Message) << " Compute eigenpack, local timeslice = " << t << " / " << Ntlocal << std::endl;
+        LOG(Message) << "------------------------------------------------------------" << std::endl;
+        eig[t].resize(LPar.Nk+LPar.Np,gridLD.get());
+        
+        // Construct the Laplacian for this timeslice from the smeared links
+        ExtractSliceLocal(UmuNoTime,Umu_smear,0,t,Tdir); // switch to 3d/4d objects
+        Laplacian3D<LatticeColourVector> Nabla(UmuNoTime);
+        LOG(Message) << "Chebyshev preconditioning to order " << ChebPar.PolyOrder
+                     << " with parameters (alpha,beta) = (" << ChebPar.alpha << "," << ChebPar.beta << ")" << std::endl;
+        Chebyshev<LatticeColourVector> Cheb(ChebPar.alpha,ChebPar.beta,ChebPar.PolyOrder);
+        
+        // Construct source vector according to Test_dwf_compressed_lanczos.cc
+        src = 11.0; // NB: This is a dummy parameter and just needs to be non-zero
+        RealD nn = norm2(src);
+        nn = Grid::sqrt(nn);
+        src = src * (1.0/nn);
+        
+        Laplacian3DHerm<LatticeColourVector> NablaCheby(Cheb,Nabla);
+        ImplicitlyRestartedLanczos<LatticeColourVector>
+        IRL(NablaCheby,Nabla,LPar.Nvec,LPar.Nk,LPar.Nk+LPar.Np,LPar.resid,LPar.MaxIt);
+        int Nconv = 0;
+        IRL.calc(eig[t].eval,eig[t].evec,src,Nconv);
+        if (Nconv < LPar.Nvec)
+        {
+            // NB: Can't assert here since we are processing local slices - i.e. not all nodes would assert
+            ConvergenceErrors = 1;
+            LOG(Error) << "MDistil::LapEvec : Not enough eigenvectors converged. If this occurs in practice, we should modify the eigensolver to iterate once more to ensure the second convergence test does not take us below the requested number of eigenvectors" << std::endl;
+        }
+        if( Nconv != LPar.Nvec )
+            eig[t].resize(LPar.Nvec, gridLD.get());
+        RotateEigen( eig[t].evec ); // Rotate the eigenvectors into our phase convention
+        
+        for (int i=0;i<LPar.Nvec;i++){
+            InsertSliceLocal(eig[t].evec[i],eig4d.evec[i],0,t,Tdir);
+            if(t==0 && Ntfirst==0)
+                eig4d.eval[i] = eig[t].eval[i]; // TODO: Discuss: is this needed? Is there a better way?
+        }
+    }
+    GridLogIRL.Active( PreviousIRLLogState );
+    gridHD->GlobalSum(ConvergenceErrors); // combine the per-rank failure flags so every rank takes the same branch below
+    if(ConvergenceErrors!=0)
+    {
+        HADRONS_ERROR(Program,"The eigensolver failed to find enough eigenvectors on at least one node");
+    }
+    // Now write out the 4d eigenvectors
+    std::string sEigenPackName(par().FileName);
+    if( !sEigenPackName.empty() )
+    {
+        eig4d.record.solverXml = parString();
+        ModuleBase * b{vm().getModule(par().gauge)};
+        std::string sOperatorXml{ "<module><id><type>" };
+        sOperatorXml.append( b->getRegisteredName() );
+        sOperatorXml.append( "</type></id><options>" );
+        sOperatorXml.append( b->parString() );
+        sOperatorXml.append( "</options></module>" );
+        eig4d.record.operatorXml = sOperatorXml;
+        sEigenPackName.append(".");
+        sEigenPackName.append(std::to_string(vm().getTrajectory()));
+        eig4d.write(sEigenPackName,false);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MDistil_LapEvec_hpp_
--- a/Hadrons/Modules/MDistil/Noises.cc
+++ b/Hadrons/Modules/MDistil/Noises.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/Noises.cc
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Modules/MDistil/Noises.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MDistil;
+
+template class Grid::Hadrons::MDistil::TNoises<FIMPL>; // explicit instantiation: compile the module for FIMPL in this translation unit
--- a/Hadrons/Modules/MDistil/Noises.hpp
+++ b/Hadrons/Modules/MDistil/Noises.hpp
@@ -0,0 +1,146 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/Noises.hpp
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MDistil_Noises_hpp_
+#define Hadrons_MDistil_Noises_hpp_
+
+#include <Hadrons/Modules/MDistil/Distil.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+BEGIN_MODULE_NAMESPACE(MDistil)
+
+/******************************************************************************
+ *                         Noises                                 *
+ ******************************************************************************/
+
+class NoisesPar: Serializable // Parameters of the Noises module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(NoisesPar,
+                                    std::string, DistilParams,   // input: name of the DistilParameters object
+                                    std::string, NoiseFileName)  // file name stem; execute() appends ".<trajectory>" before writing
+};
+
+template <typename FImpl>
+class TNoises: public Module<NoisesPar>
+{
+public:
+    // constructor
+    TNoises(const std::string name);
+    // destructor
+    virtual ~TNoises(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup (creates the NoiseTensor output object)
+    virtual void setup(void);
+    // execution (fills the NoiseTensor; the boss rank writes it to file)
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(Noises, TNoises<FIMPL>, MDistil);
+
+/******************************************************************************
+ *                 TNoises implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TNoises<FImpl>::TNoises(const std::string name) : Module<NoisesPar>(name) {} // forwards the instance name to the Module base
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TNoises<FImpl>::getInput(void)
+{
+    return std::vector<std::string>{par().DistilParams}; // the distillation parameters are the only dependency
+}
+
+template <typename FImpl>
+std::vector<std::string> TNoises<FImpl>::getOutput(void)
+{
+    return std::vector<std::string>{getName()}; // the noise tensor is named after the module
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+
+template <typename FImpl>
+void TNoises<FImpl>::setup(void)
+{
+    const DistilParameters &dp{envGet(DistilParameters, par().DistilParams)};
+    const int Nt{env().getDim(Tdir)};
+    LOG(Message) << "Noise tensor dimensions: nnoise=" << dp.nnoise << ", Nt=" << Nt << ", nvec=" << dp.nvec << ", Ns=" << Ns << std::endl;
+    envCreate(NoiseTensor, getName(), 1, dp.nnoise, Nt, dp.nvec, Ns); // indexed as (noise, time, eigenvector, spin)
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TNoises<FImpl>::execute(void)
+{
+    const DistilParameters &dp{envGet(DistilParameters, par().DistilParams)};
+    const int Nt{env().getDim(Tdir)};
+    const bool full_tdil{ dp.TI == Nt };
+    const bool exact_distillation{ full_tdil && dp.LI == dp.nvec };
+
+    // NOTE(review): an earlier comment said module-specific seeds are used, but the draw below uses rngSerial() - confirm
+    Real rn;
+    auto &noise = envGet(NoiseTensor, getName());
+    for (int inoise = 0; inoise < dp.nnoise; inoise++)
+    {
+        for (int t = 0; t < Nt; t++)
+        {
+            for (int ivec = 0; ivec < dp.nvec; ivec++)
+            {
+                for (int is = 0; is < Ns; is++)
+                {
+                    if (exact_distillation)
+                    {
+                        noise.tensor(inoise, t, ivec, is) = 1.; // exact distillation: every entry is 1, no random draw
+                    }
+                    else
+                    {
+                        random(rngSerial(),rn);
+                        // We could use a greater number of complex roots of unity
+                        // ... but this seems to work well
+                        noise.tensor(inoise, t, ivec, is) = (rn > 0.5) ? -1 : 1;
+                    }
+                }
+            }
+        }
+    }
+    if (!par().NoiseFileName.empty() && env().getGrid()->IsBoss()) // boss rank only; no file when no name was given
+    {
+        std::string sName {par().NoiseFileName};
+        sName.append(".");
+        sName.append(std::to_string(vm().getTrajectory()));
+        noise.write(sName.c_str());
+    }
+}
+
+END_MODULE_NAMESPACE
+END_HADRONS_NAMESPACE
+#endif
--- a/Hadrons/Modules/MDistil/PerambFromSolve.cc
+++ b/Hadrons/Modules/MDistil/PerambFromSolve.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/PerambFromSolve.cc
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Modules/MDistil/PerambFromSolve.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MDistil;
+
+// Explicit instantiation of the module template for the FIMPL implementation,
+// so its member definitions are compiled in this translation unit.
+template class Grid::Hadrons::MDistil::TPerambFromSolve<FIMPL>;
--- a/Hadrons/Modules/MDistil/PerambFromSolve.hpp
+++ b/Hadrons/Modules/MDistil/PerambFromSolve.hpp
@@ -0,0 +1,183 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/PerambFromSolve.hpp
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MDistil_PerambFromSolve_hpp_
+#define Hadrons_MDistil_PerambFromSolve_hpp_
+
+#include <Hadrons/Modules/MDistil/Distil.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+BEGIN_MODULE_NAMESPACE(MDistil)
+
+/******************************************************************************
+ *                         PerambFromSolve 
+
+  This module computes a perambulator from an already completed solve. 
+  Optionally, the number of eigenvectors used in the perambulator and the 
+  parameter LI can be chosen to be lower than the ones in the solve, allowing 
+  for a study of the signal with different values of nvec. 
+
+  LI_reduced  : value of LI actually used in the computation
+  nvec_reduced: value of nvec actually used in the computation
+  LI          : value of LI used to compute the 'solve'
+  nvec        : value of nvec used to compute the 'solve'
+
+ ******************************************************************************/
+
+// Serializable parameter set of the PerambFromSolve module.
+//   lapevec        : name of the eigenpack (EigenPack<LatticeColourVector>)
+//                    whose eigenvectors the solves are projected onto
+//   PerambFileName : base name of the output file; ".<trajectory>" is appended
+//   solve          : name of the std::vector<FermionField> holding the solves
+//   nvec_reduced   : number of eigenvectors used (perambulator nVec extent)
+//   LI_reduced     : number of dilution components dk used (perambulator LI extent)
+//   DistilParams   : name of the DistilParameters object used for the solve
+class PerambFromSolvePar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(PerambFromSolvePar,
+                                    std::string, lapevec,
+                                    std::string, PerambFileName,
+                                    std::string, solve,
+                                    int, nvec_reduced,
+                                    int, LI_reduced,
+                                    std::string, DistilParams);
+};
+
+// Module that builds a PerambTensor from an existing vector of solves.
+// FImpl is the fermion implementation passed to FERM_TYPE_ALIASES.
+template <typename FImpl>
+class TPerambFromSolve: public Module<PerambFromSolvePar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    // constructor
+    TPerambFromSolve(const std::string name);
+    // destructor
+    virtual ~TPerambFromSolve(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+protected:
+    // Lower-dimensional grid filled in by MakeLowerDimGrid() during setup().
+    // Owned by this module; the unique_ptr releases it automatically.
+    std::unique_ptr<GridCartesian> grid3d;
+};
+
+MODULE_REGISTER_TMP(PerambFromSolve, TPerambFromSolve<FIMPL>, MDistil);
+
+/******************************************************************************
+ *                 TPerambFromSolve implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+// Forwards the module name to the Module<PerambFromSolvePar> base; no other state is set here.
+template <typename FImpl>
+TPerambFromSolve<FImpl>::TPerambFromSolve(const std::string name)
+: Module<PerambFromSolvePar>(name)
+{
+}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Objects this module reads: the solves, the eigenpack and the distillation parameters.
+template <typename FImpl>
+std::vector<std::string> TPerambFromSolve<FImpl>::getInput(void)
+{
+    std::vector<std::string> in;
+    in.push_back(par().solve);
+    in.push_back(par().lapevec);
+    in.push_back(par().DistilParams);
+    return in;
+}
+
+// The only declared product is the perambulator, stored under the module name.
+template <typename FImpl>
+std::vector<std::string> TPerambFromSolve<FImpl>::getOutput(void)
+{
+    return {getName()};
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+// Create the lower-dimensional grid, the output perambulator and the lattice
+// temporaries used by execute().
+template <typename FImpl>
+void TPerambFromSolve<FImpl>::setup(void)
+{
+    const DistilParameters & dp{envGet(MDistil::DistilParameters, par().DistilParams)};
+    const int Nt{env().getDim(Tdir)};
+    // With full time dilution (TI == Nt) there is a single time-dilution index
+    const int Nt_inv{ (dp.TI == Nt) ? 1 : dp.TI };
+    const int nvecUsed{par().nvec_reduced};
+    const int liUsed{par().LI_reduced};
+
+    MakeLowerDimGrid( grid3d, env().getGrid() );
+    GridCartesian * const lowDimGrid{grid3d.get()};
+
+    // Perambulator extents: (Nt, nvec_reduced, LI_reduced, nnoise, Nt_inv, SI)
+    envCreate(PerambTensor, getName(), 1, Nt, nvecUsed, liUsed, dp.nnoise, Nt_inv, dp.SI);
+    // NOTE(review): this noise tensor is not read by execute() and is not
+    // listed in getOutput() - confirm whether it is still required.
+    envCreate(NoiseTensor, getName() + "_noise", 1, dp.nnoise, Nt, dp.nvec, Ns );
+
+    envTmp(LatticeColourVector, "result3d_nospin", 1, LatticeColourVector(lowDimGrid));
+    envTmp(LatticeColourVector, "evec3d", 1, LatticeColourVector(lowDimGrid));
+    envTmpLat(LatticeColourVector, "result4d_nospin");
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Project each stored solve onto the first nvec_reduced eigenvectors, one
+// local timeslice and spin component at a time, filling the perambulator;
+// then write it to "<PerambFileName>.<trajectory>" from the boss rank.
+template <typename FImpl>
+void TPerambFromSolve<FImpl>::execute(void)
+{
+    const DistilParameters &dp{envGet(DistilParameters, par().DistilParams)};
+    GridCartesian * const grid4d{env().getGrid()};
+    const int Nt{env().getDim(Tdir)};
+    const int Nt_inv{ (dp.TI == Nt) ? 1 : dp.TI };
+    // Range of global time coordinates held by this rank
+    const int tBegin{grid4d->LocalStarts()[3]};
+    const int tEnd{tBegin + grid4d->LocalDimensions()[3]};
+    const int nvecUsed{par().nvec_reduced};
+    const int liUsed{par().LI_reduced};
+
+    auto &perambulator  = envGet(PerambTensor, getName());
+    auto &solve         = envGet(std::vector<FermionField>, par().solve);
+    auto &epack         = envGet(Grid::Hadrons::EigenPack<LatticeColourVector>, par().lapevec);
+
+    envGetTmp(LatticeColourVector, result4d_nospin);
+    envGetTmp(LatticeColourVector, result3d_nospin);
+    envGetTmp(LatticeColourVector, evec3d);
+
+    for (int inoise = 0; inoise < dp.nnoise; inoise++)
+    for (int dk = 0; dk < liUsed; dk++)
+    for (int dt = 0; dt < Nt_inv; dt++)
+    for (int ds = 0; ds < dp.SI; ds++)
+    {
+        // The solves are indexed with the full LI of the original solve,
+        // even though only dk < LI_reduced is visited here.
+        const auto solveIdx = inoise+dp.nnoise*(dk+dp.LI*(dt+Nt_inv*ds));
+        for (int is = 0; is < Ns; is++)
+        {
+            result4d_nospin = peekSpin(solve[solveIdx],is);
+            for (int t = tBegin; t < tEnd; t++)
+            {
+                const int tLocal{t - tBegin};
+                ExtractSliceLocal(result3d_nospin,result4d_nospin,0,tLocal,Tdir);
+                for (int ivec = 0; ivec < nvecUsed; ivec++)
+                {
+                    ExtractSliceLocal(evec3d,epack.evec[ivec],0,tLocal,Tdir);
+                    pokeSpin(perambulator.tensor(t, ivec, dk, inoise,dt,ds),
+                             static_cast<Complex>(innerProduct(evec3d, result3d_nospin)),
+                             is);
+                    LOG(Message) <<  "perambulator(t, ivec, dk, inoise,dt,ds)(is) = ("
+                                 << t << "," << ivec << "," << dk << "," << inoise << "," << dt << "," << ds
+                                 << ")(" << is << ") = "
+                                 <<  perambulator.tensor(t, ivec, dk, inoise,dt,ds)()(is)() << std::endl;
+                }
+            }
+        }
+    }
+
+    // Only the boss rank writes the file
+    if (!grid4d->IsBoss())
+    {
+        return;
+    }
+    const std::string sPerambName{par().PerambFileName + "." + std::to_string(vm().getTrajectory())};
+    perambulator.write(sPerambName.c_str());
+}
+
+END_MODULE_NAMESPACE
+END_HADRONS_NAMESPACE
+#endif // Hadrons_MDistil_PerambFromSolve_hpp_
--- a/Hadrons/Modules/MDistil/Perambulator.cc
+++ b/Hadrons/Modules/MDistil/Perambulator.cc
@@ -0,0 +1,57 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/Perambulator.cc
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Modules/MDistil/Perambulator.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MDistil;
+
+// Explicit instantiation of the module template for the FIMPL implementation,
+// so its member definitions are compiled in this translation unit.
+template class Grid::Hadrons::MDistil::TPerambulator<FIMPL>;
+
+BEGIN_HADRONS_NAMESPACE
+
+// Global constants for distillation
+
+// File extension for named tensors: ".h5" when built with HAVE_HDF5,
+// ".dat" otherwise. 'extern' gives this const definition external linkage.
+#ifdef HAVE_HDF5
+extern const std::string NamedTensorFileExtension{".h5"};
+#else
+extern const std::string NamedTensorFileExtension{".dat"};
+#endif
+
+BEGIN_MODULE_NAMESPACE(MDistil)
+
+// Definitions of the static name and default index-name members of the two
+// tensor types. The index names are listed in the same order as the extents
+// passed when the tensors are created in the modules' setup() functions.
+const std::string                NoiseTensor::Name__{"Noises"};
+const std::array<std::string, 4> NoiseTensor::DefaultIndexNames__{"nNoise", "nT", "nVec", "nS"};
+
+const std::string                PerambTensor::Name__{"Perambulator"};
+const std::array<std::string, 6> PerambTensor::DefaultIndexNames__{"nT", "nVec", "LI", "nNoise", "nT_inv", "SI"};
+
+END_MODULE_NAMESPACE
+END_HADRONS_NAMESPACE
--- a/Hadrons/Modules/MDistil/Perambulator.hpp
+++ b/Hadrons/Modules/MDistil/Perambulator.hpp
@@ -0,0 +1,263 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: Hadrons/Modules/MDistil/Perambulator.hpp
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <ferben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MDistil_Perambulator_hpp_
+#define Hadrons_MDistil_Perambulator_hpp_
+
+#include <Hadrons/Modules/MDistil/Distil.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+BEGIN_MODULE_NAMESPACE(MDistil)
+
+/******************************************************************************
+ *                             Perambulator                                    *
+ ******************************************************************************/
+
+// Serializable parameter set of the Perambulator module.
+//   lapevec               : name of the eigenvector pack (LapEvecs)
+//   solver                : name of the Solver object used for the inversions
+//   noise                 : name of the NoiseTensor used to build the sources
+//   PerambFileName        : base name of the output file; ".<trajectory>" is appended
+//   UnsmearedSinkFileName : if non-empty, the solutions are kept and written
+//                           under this name; if empty they are not stored
+//   DistilParams          : name of the DistilParameters object
+class PerambulatorPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(PerambulatorPar,
+                                    std::string, lapevec,
+                                    std::string, solver,
+                                    std::string, noise,
+                                    std::string, PerambFileName,
+                                    std::string, UnsmearedSinkFileName,
+                                    std::string, DistilParams);
+};
+
+// Module that builds distillation sources, solves on them and stores the
+// projected solutions in a PerambTensor (see execute()).
+// FImpl is the fermion implementation passed to the *_TYPE_ALIASES macros.
+template <typename FImpl>
+class TPerambulator: public Module<PerambulatorPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    SOLVER_TYPE_ALIASES(FImpl,);
+    // constructor
+    TPerambulator(const std::string name);
+    // destructor
+    virtual ~TPerambulator(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+protected:
+    // Lower-dimensional grid filled in by MakeLowerDimGrid() during setup().
+    // Owned by this module; the unique_ptr releases it automatically.
+    std::unique_ptr<GridCartesian> grid3d;
+    // Ls of the solver object, read in setup(); Ls_ == 1 selects the direct 4D solve
+    unsigned int Ls_;
+};
+
+MODULE_REGISTER_TMP(Perambulator, TPerambulator<FIMPL>, MDistil);
+
+/******************************************************************************
+ *                 TPerambulator implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+// Forwards the module name to the Module<PerambulatorPar> base; Ls_ and grid3d are set in setup().
+template <typename FImpl>
+TPerambulator<FImpl>::TPerambulator(const std::string name)
+: Module<PerambulatorPar>(name)
+{
+}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Objects this module reads: eigenvectors, solver, noise and distillation parameters.
+template <typename FImpl>
+std::vector<std::string> TPerambulator<FImpl>::getInput(void)
+{
+    std::vector<std::string> in;
+    in.push_back(par().lapevec);
+    in.push_back(par().solver);
+    in.push_back(par().noise);
+    in.push_back(par().DistilParams);
+    return in;
+}
+
+// Products: the perambulator (module name) and the vector of unsmeared sinks.
+template <typename FImpl>
+std::vector<std::string> TPerambulator<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out;
+    out.push_back(getName());
+    out.push_back(getName() + "_unsmeared_sink");
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+// Create the lower-dimensional grid, the two output objects and all lattice
+// temporaries used by execute(); also record the solver's Ls.
+template <typename FImpl>
+void TPerambulator<FImpl>::setup(void)
+{
+    MakeLowerDimGrid(grid3d, env().getGrid());
+    GridCartesian * const lowDimGrid{grid3d.get()};
+
+    const DistilParameters &dp{envGet(DistilParameters, par().DistilParams)};
+    const int Nt{env().getDim(Tdir)};
+    // With full time dilution (TI == Nt) there is a single time-dilution index
+    const int Nt_inv{ (dp.TI == Nt) ? 1 : dp.TI };
+
+    // Outputs: perambulator with extents (Nt, nvec, LI, nnoise, Nt_inv, SI)
+    // and one fermion field per stored solution
+    envCreate(PerambTensor, getName(), 1, Nt, dp.nvec, dp.LI, dp.nnoise, Nt_inv, dp.SI);
+    envCreate(std::vector<FermionField>, getName() + "_unsmeared_sink", 1,
+              dp.nnoise*dp.LI*Ns*Nt_inv, envGetGrid(FermionField));
+
+    // Temporaries on the full grid and on the lower-dimensional grid
+    envTmpLat(LatticeSpinColourVector, "dist_source");
+    envTmpLat(LatticeSpinColourVector, "source4d");
+    envTmp(LatticeSpinColourVector, "source3d", 1, LatticeSpinColourVector(lowDimGrid));
+    envTmp(LatticeColourVector, "source3d_nospin", 1, LatticeColourVector(lowDimGrid));
+    envTmpLat(LatticeSpinColourVector, "result4d");
+    envTmpLat(LatticeColourVector, "result4d_nospin");
+    envTmp(LatticeColourVector, "result3d_nospin", 1, LatticeColourVector(lowDimGrid));
+    envTmp(LatticeColourVector, "evec3d", 1, LatticeColourVector(lowDimGrid));
+
+    // Solver work vectors; the 5D ones use the solver's Ls
+    Ls_ = env().getObjectLs(par().solver);
+    envTmpLat(FermionField, "v4dtmp");
+    envTmpLat(FermionField, "v5dtmp", Ls_);
+    envTmpLat(FermionField, "v5dtmp_sol", Ls_);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Compute the perambulator.
+// For every (noise, dk, dt, ds) combination: build a diluted source from the
+// eigenvectors and the noise tensor, solve on it, then project each spin
+// component of the solution onto every eigenvector on each local timeslice.
+// Afterwards the timeslice data are exchanged between ranks along Tdir, the
+// perambulator is written by the boss rank and, if a file name was given,
+// the unprojected solutions ("unsmeared sinks") are written as well.
+template <typename FImpl>
+void TPerambulator<FImpl>::execute(void)
+{
+    const DistilParameters &dp{ envGet(DistilParameters, par().DistilParams) };
+    const int Nt{env().getDim(Tdir)};
+    const bool full_tdil{ dp.TI == Nt }; 
+    // With full time dilution there is a single time-dilution index
+    const int Nt_inv{ full_tdil ? 1 : dp.TI };
+
+    auto &solver=envGet(Solver, par().solver);
+    auto &mat = solver.getFMat();
+    envGetTmp(FermionField, v4dtmp);
+    envGetTmp(FermionField, v5dtmp);
+    envGetTmp(FermionField, v5dtmp_sol);
+    auto &noise = envGet(NoiseTensor, par().noise);
+    auto &perambulator = envGet(PerambTensor, getName());
+    auto &epack = envGet(LapEvecs, par().lapevec);
+    auto &unsmeared_sink = envGet(std::vector<FermionField>, getName() + "_unsmeared_sink");
+    envGetTmp(LatticeSpinColourVector, dist_source);
+    envGetTmp(LatticeSpinColourVector, source4d);
+    envGetTmp(LatticeSpinColourVector, source3d);
+    envGetTmp(LatticeColourVector, source3d_nospin);
+    envGetTmp(LatticeSpinColourVector, result4d);
+    envGetTmp(LatticeColourVector, result4d_nospin);
+    envGetTmp(LatticeColourVector, result3d_nospin);
+    envGetTmp(LatticeColourVector, evec3d);
+    GridCartesian * const grid4d{ env().getGrid() }; // Owned by environment (so I won't delete it)
+    // Extent and first global coordinate of this rank's portion of the time direction
+    const int Ntlocal{grid4d->LocalDimensions()[3]};
+    const int Ntfirst{grid4d->LocalStarts()[3]};
+    const std::string UnsmearedSinkFileName{ par().UnsmearedSinkFileName };
+
+    for (int inoise = 0; inoise < dp.nnoise; inoise++)
+    {
+        for (int dk = 0; dk < dp.LI; dk++)
+        {
+            for (int dt = 0; dt < Nt_inv; dt++)
+            {
+                for (int ds = 0; ds < dp.SI; ds++)
+                {
+                    LOG(Message) <<  "LapH source vector from noise " << inoise << " and dilution component (d_k,d_t,d_alpha) : (" << dk << ","<< dt << "," << ds << ")" << std::endl;
+                    // Accumulate the diluted source: timeslices dt, dt+TI, ...,
+                    // eigenvectors dk, dk+LI, ... and spins ds, ds+SI, ...
+                    dist_source = 0;
+                    evec3d = 0;
+                    for (int it = dt; it < Nt; it += dp.TI)
+                    {
+                        // With full time dilution the source sits on timeslice dp.tsrc
+                        const int t_inv{full_tdil ? dp.tsrc : it};
+                        // Only timeslices held by this rank contribute locally
+                        if( t_inv >= Ntfirst && t_inv < Ntfirst + Ntlocal )
+                        {
+                            for (int ik = dk; ik < dp.nvec; ik += dp.LI)
+                            {
+                                for (int is = ds; is < Ns; is += dp.SI)
+                                {
+                                    // noise * eigenvector, placed in spin component 'is'
+                                    // of timeslice t_inv and added to the source
+                                    ExtractSliceLocal(evec3d,epack.evec[ik],0,t_inv-Ntfirst,Tdir);
+                                    source3d_nospin = evec3d * noise.tensor(inoise, t_inv, ik, is);
+                                    source3d=0;
+                                    pokeSpin(source3d,source3d_nospin,is);
+                                    source4d=0;
+                                    InsertSliceLocal(source3d,source4d,0,t_inv-Ntfirst,Tdir);
+                                    dist_source += source4d;
+                                }
+                            }
+                        }
+                    }
+                    result4d=0;
+                    v4dtmp = dist_source;
+                    // Ls_ == 1: solve directly on the 4D source. Otherwise import the
+                    // source into the 5D work vector, solve, and export back to 4D.
+                    if (Ls_ == 1)
+                        solver(result4d, v4dtmp);
+                    else
+                    {
+                        mat.ImportPhysicalFermionSource(v4dtmp, v5dtmp);
+                        solver(v5dtmp_sol, v5dtmp);
+                        mat.ExportPhysicalFermionSolution(v5dtmp_sol, v4dtmp);
+                        result4d = v4dtmp;
+                    }
+                    // The solution is only kept when a sink file name was supplied;
+                    // otherwise the "_unsmeared_sink" entries are never assigned here.
+                    if (!UnsmearedSinkFileName.empty())
+                        unsmeared_sink[inoise+dp.nnoise*(dk+dp.LI*(dt+Nt_inv*ds))] = result4d;
+                    // Project each spin component of the solution onto every
+                    // eigenvector, timeslice by timeslice
+                    for (int is = 0; is < Ns; is++)
+                    {
+                        result4d_nospin = peekSpin(result4d,is);
+                        for (int t = Ntfirst; t < Ntfirst + Ntlocal; t++)
+                        {
+                            ExtractSliceLocal(result3d_nospin,result4d_nospin,0,t-Ntfirst,Tdir); 
+			    for (int ivec = 0; ivec < dp.nvec; ivec++)
+                            {
+                                ExtractSliceLocal(evec3d,epack.evec[ivec],0,t-Ntfirst,Tdir);
+                                pokeSpin(perambulator.tensor(t, ivec, dk, inoise,dt,ds),static_cast<Complex>(innerProduct(evec3d, result3d_nospin)),is);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // Now share my timeslice data with other members of the grid
+    // NOTE(review): the exchange treats the tensor storage as NumSlices equal,
+    // contiguous chunks ordered by processor coordinate in Tdir - confirm this
+    // matches the tensor's memory layout (time index slowest) and that the
+    // element count divides evenly.
+    const int NumSlices{grid4d->_processors[Tdir] / grid3d->_processors[Tdir]};
+    if (NumSlices > 1)
+    {
+        LOG(Debug) <<  "Sharing perambulator data with other nodes" << std::endl;
+        const int MySlice {grid4d->_processor_coor[Tdir]};
+        const int SliceCount {static_cast<int>(perambulator.tensor.size()/NumSlices)};
+        PerambTensor::Scalar * const MyData {perambulator.tensor.data()+MySlice*SliceCount};
+        // Partner ranks share this rank's coordinates in every direction below Tdir
+        Coordinate coor(Nd);
+        for (int i = 0 ; i < Tdir ; i++) coor[i] = grid4d->_processor_coor[i];
+        std::vector<CommsRequest_t> reqs(0);
+        // Step i: send my chunk to the rank i slices ahead and receive the
+        // chunk belonging to the rank i slices behind (both modulo NumSlices)
+        for (int i = 1; i < NumSlices ; i++)
+        {
+            coor[Tdir] = (MySlice+i)%NumSlices;
+            const int SendRank { grid4d->RankFromProcessorCoor(coor) };
+            const int RecvSlice { ( MySlice - i + NumSlices ) % NumSlices };
+            coor[Tdir] = RecvSlice;
+            const auto RecvRank = grid4d->RankFromProcessorCoor(coor);
+            grid4d->SendToRecvFromBegin(reqs,MyData,SendRank, perambulator.tensor.data()
+                                        + RecvSlice*SliceCount,RecvRank,SliceCount*sizeof(PerambTensor::Scalar));
+        }
+        grid4d->SendToRecvFromComplete(reqs);
+    }
+    
+    // Save the perambulator to disk from the boss node
+    // File name is "<PerambFileName>.<trajectory>"
+    if (grid4d->IsBoss())
+    {
+        std::string sPerambName {par().PerambFileName};
+        sPerambName.append(".");
+        sPerambName.append(std::to_string(vm().getTrajectory()));
+        perambulator.write(sPerambName.c_str());
+    }
+    
+    //Save the unsmeared sinks if filename specified
+    if (!UnsmearedSinkFileName.empty())
+    {
+        LOG(Message) << "Writing unsmeared sink to " << UnsmearedSinkFileName << std::endl;
+        A2AVectorsIo::write(UnsmearedSinkFileName, unsmeared_sink, false, vm().getTrajectory());
+    }
+}
+
+END_MODULE_NAMESPACE
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MDistil_Perambulator_hpp_
--- a/Show More
+++ b/Show More