Merge branch 'develop' into feature/zmobius_paramcompute

2026-05-06 10:14:31 +01:00 · 2020-04-20 09:12:34 -04:00
parent 96671bbb24 90229cfb0f
commit 181709bba4
300 changed files with 3177 additions and 31957 deletions
@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 #include <Grid/algorithms/approx/Zolotarev.h>
 #include <Grid/algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/JacobiPolynomial.h>
 #include <Grid/algorithms/approx/Remez.h>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
@@ -43,11 +44,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 #include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 #include <Grid/algorithms/iterative/MinimalResidual.h>
@@ -1,3 +1,14 @@
+    // blockZaxpy in bockPromote - 3s, 5%
+    // noncoalesced linalg in Preconditionoer ~ 3s 5%
+    // Lancos tuning or replace 10-20s ~ 25%, open ended
+    // setup tuning   5s  ~  8%
+    //    -- e.g. ordermin, orderstep tunables.
+    // MdagM path without norm in LinOp code.     few seconds
+
+    // Mdir calc blocking kernels
+    // Fuse kernels in blockMaskedInnerProduct
+    // preallocate Vectors in Cayley 5D ~ few percent few seconds
+
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -34,15 +45,36 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+template<class vobj,class CComplex>
+inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
+				    const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask,
+				    const Lattice<vobj> &fineX,
+				    const Lattice<vobj> &fineY)
+{
+  typedef decltype(innerProduct(vobj(),vobj())) dotp;
+
+  GridBase *coarse(CoarseInner.Grid());
+  GridBase *fine  (fineX.Grid());
+
+  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
+  Lattice<dotp> fine_inner_msk(fine);
+
+  // Multiply could be fused with innerProduct
+  // Single block sum kernel could do both masks.
+  fine_inner = localInnerProduct(fineX,fineY);
+  mult(fine_inner_msk, fine_inner,FineMask);
+  blockSum(CoarseInner,fine_inner_msk);
+}
+
+
 class Geometry {
-  //    int dimension;
 public:
  int npoint;
  std::vector<int> directions   ;
  std::vector<int> displacements;

  Geometry(int _d)  {
-  
+    
    int base = (_d==5) ? 1:0;

    // make coarse grid stencil for 4d , not 5d
@@ -52,10 +84,10 @@ public:
    directions.resize(npoint);
    displacements.resize(npoint);
    for(int d=0;d<_d;d++){
-      directions[2*d  ] = d+base;
-      directions[2*d+1] = d+base;
-      displacements[2*d  ] = +1;
-      displacements[2*d+1] = -1;
+      directions[d   ] = d+base;
+      directions[d+_d] = d+base;
+      displacements[d  ] = +1;
+      displacements[d+_d]= -1;
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
@@ -63,7 +95,7 @@ public:
    //// report back
    std::cout<<GridLogMessage<<"directions    :";
    for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
-    std::cout <<std::endl;
+    std::cout<<std::endl;
    std::cout<<GridLogMessage<<"displacements :";
    for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
    std::cout<<std::endl;
@@ -115,10 +147,10 @@ public:
  
  void Orthogonalise(void){
    CoarseScalar InnerProd(CoarseGrid); 
-    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
-    blockOrthogonalise(InnerProd,subspace);
-    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
+    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
+    //    blockOrthogonalise(InnerProd,subspace);
    //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
    //      CheckOrthogonal();
  } 
@@ -128,7 +160,7 @@ public:
    for(int i=0;i<nbasis;i++){
      blockProject(iProj,subspace[i],subspace);
      eProj=Zero(); 
-      thread_for(ss, CoarseGrid->oSites(),{
+      accelerator_for(ss, CoarseGrid->oSites(),1,{
 	eProj[ss](i)=CComplex(1.0);
      });
      eProj=eProj - iProj;
@@ -146,61 +178,9 @@ public:
  void CreateSubspaceRandom(GridParallelRNG &RNG){
    for(int i=0;i<nbasis;i++){
      random(RNG,subspace[i]);
-      std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
    }
-    Orthogonalise();
  }

-  /*
-    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
-    {
-    // Run a Lanczos with sloppy convergence
-    const int Nstop = nn;
-    const int Nk = nn+20;
-    const int Np = nn+20;
-    const int Nm = Nk+Np;
-    const int MaxIt= 10000;
-    RealD resid = 1.0e-3;
-
-    Chebyshev<FineField> Cheb(0.5,64.0,21);
-    ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
-    //	IRL.lock = 1;
-
-    FineField noise(FineGrid); gaussian(RNG,noise);
-    FineField tmp(FineGrid); 
-    std::vector<RealD>     eval(Nm);
-    std::vector<FineField> evec(Nm,FineGrid);
-
-    int Nconv;
-    IRL.calc(eval,evec,
-    noise,
-    Nconv);
-
-    // pull back nn vectors
-    for(int b=0;b<nn;b++){
-
-    subspace[b]   = evec[b];
-
-    std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
-
-    hermop.Op(subspace[b],tmp); 
-    std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
-
-    noise = tmp -  sqrt(eval[b])*subspace[b] ;
-
-    std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
-
-    noise = tmp +  eval[b]*subspace[b] ;
-
-    std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
-
-    }
-    Orthogonalise();
-    for(int b=0;b<nn;b++){
-    std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
-    }
-    }
-  */
  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {

    RealD scale;
@@ -232,54 +212,316 @@ public:
      subspace[b]   = noise;

    }
-
-    Orthogonalise();
-
  }

-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
+  // and this is the best I found
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+#if 1
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {

    RealD scale;

+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);

-    Chebyshev<FineField> Cheb(0.1,64.0,900);
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {
+      // Filter
+      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
+      Cheb(hermop,noise,Mn);
+      // normalise
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+      b++;
+    }
+
+    // Generate a full sequence of Chebyshevs
+    {
+      lo=filterlo;
+      noise=Mn;
+
+      FineField T0(FineGrid); T0 = noise;  
+      FineField T1(FineGrid); 
+      FineField T2(FineGrid);
+      FineField y(FineGrid);
+      
+      FineField *Tnm = &T0;
+      FineField *Tn  = &T1;
+      FineField *Tnp = &T2;
+
+      // Tn=T1 = (xscale M + mscale)in
+      RealD xscale = 2.0/(hi-lo);
+      RealD mscale = -(hi+lo)/(hi-lo);
+      hermop.HermOp(T0,y);
+      T1=y*xscale+noise*mscale;
+
+      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
+	
+	hermop.HermOp(*Tn,y);
+
+	auto y_v = y.View();
+	auto Tn_v = Tn->View();
+	auto Tnp_v = Tnp->View();
+	auto Tnm_v = Tnm->View();
+	const int Nsimd = CComplex::Nsimd();
+	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+        });
+
+	// Possible more fine grained control is needed than a linear sweep,
+	// but huge productivity gain if this is simple algorithm and not a tunable
+	int m =1;
+	if ( n>=ordermin ) m=n-ordermin;
+	if ( (m%orderstep)==0 ) { 
+	  Mn=*Tnp;
+	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
+	  subspace[b] = Mn;
+	  hermop.Op(Mn,tmp); 
+	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  b++;
+	}
+
+	// Cycle pointers to avoid copies
+	FineField *swizzle = Tnm;
+	Tnm    =Tn;
+	Tn     =Tnp;
+	Tnp    =swizzle;
+	  
+      }
+    }
+    assert(b==nn);
+  }
+#endif
+#if 0
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;

    FineField noise(FineGrid);
    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+    FineField combined(FineGrid);

-    for(int b=0;b<nn;b++){
-	
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;

-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

-      Cheb(hermop,noise,Mn);
+    int b =0;
+#define FILTERb(llo,hhi,oorder)						\
+    {									\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
+      Cheb(hermop,noise,Mn);						\
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
+      subspace[b]   = Mn;						\
+      hermop.Op(Mn,tmp);						\
+      std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									

-      scale = std::pow(norm2(Mn),-0.5); 
-      Mn=Mn*scale;
-      subspace[b]   = Mn;
+    //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);	\

-      hermop.Op(Mn,noise); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(noise)<<std::endl;
-
-    }
-
-    Orthogonalise();
+    RealD alpha=-0.8;
+    RealD beta =-0.8;
+#define FILTER(llo,hhi,oorder)						\
+    {									\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
+      /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
+      Cheb(hermop,noise,Mn);						\
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
+      subspace[b]   = Mn;						\
+      hermop.Op(Mn,tmp);						\
+      std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									
+    
+#define FILTERc(llo,hhi,oorder)				\
+    {							\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);	\
+      Cheb(hermop,noise,combined);			\
+    }									

+    double node = 0.000;
+    FILTERb(lo,hi,orderfilter);// 0
+    //    FILTERc(node,hi,51);// 0
+    noise = Mn;
+    int base = 0;
+    int mult = 100;
+    FILTER(node,hi,base+1*mult);
+    FILTER(node,hi,base+2*mult);
+    FILTER(node,hi,base+3*mult);
+    FILTER(node,hi,base+4*mult);
+    FILTER(node,hi,base+5*mult);
+    FILTER(node,hi,base+6*mult);
+    FILTER(node,hi,base+7*mult);
+    FILTER(node,hi,base+8*mult);
+    FILTER(node,hi,base+9*mult);
+    FILTER(node,hi,base+10*mult);
+    FILTER(node,hi,base+11*mult);
+    FILTER(node,hi,base+12*mult);
+    FILTER(node,hi,base+13*mult);
+    FILTER(node,hi,base+14*mult);
+    FILTER(node,hi,base+15*mult);
+    assert(b==nn);
  }
+#endif
+
+#if 0
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+    FineField combined(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {						
+      Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
+      //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
+      //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
+      //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
+      JacobiPoly(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp);
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; 
+      b++;
+      //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
+      //      subspace[b]   = tmp;      b++;
+      //    }									
+    }									
+
+#define FILTER(lambda)						\
+    {								\
+      hermop.HermOp(subspace[0],tmp);				\
+      tmp = tmp - lambda *subspace[0];				\
+      scale = std::pow(norm2(tmp),-0.5);			\
+      tmp=tmp*scale;							\
+      subspace[b]   = tmp;						\
+      hermop.Op(subspace[b],tmp);					\
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									
+    //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
+    //      subspace[b]   = tmp;      b++;
+    //    }									
+
+    FILTER(2.0e-5);
+    FILTER(2.0e-4);
+    FILTER(4.0e-4);
+    FILTER(8.0e-4);
+    FILTER(8.0e-4);
+
+    FILTER(2.0e-3);
+    FILTER(3.0e-3);
+    FILTER(4.0e-3);
+    FILTER(5.0e-3);
+    FILTER(6.0e-3);
+
+    FILTER(2.5e-3);
+    FILTER(3.5e-3);
+    FILTER(4.5e-3);
+    FILTER(5.5e-3);
+    FILTER(6.5e-3);
+
+    //    FILTER(6.0e-5);//6
+    //    FILTER(7.0e-5);//8
+    //    FILTER(8.0e-5);//9
+    //    FILTER(9.0e-5);//3
+
+    /*
+    //    FILTER(1.0e-4);//10
+    FILTER(2.0e-4);//11
+    //   FILTER(3.0e-4);//12
+    //    FILTER(4.0e-4);//13
+    FILTER(5.0e-4);//14
+
+    FILTER(6.0e-3);//4
+    FILTER(7.0e-4);//1
+    FILTER(8.0e-4);//7
+    FILTER(9.0e-4);//15
+    FILTER(1.0e-3);//2
+
+    FILTER(2.0e-3);//2
+    FILTER(3.0e-3);//2
+    FILTER(4.0e-3);//2
+    FILTER(5.0e-3);//2
+    FILTER(6.0e-3);//2
+
+    FILTER(7.0e-3);//2
+    FILTER(8.0e-3);//2
+    FILTER(1.0e-2);//2
+    */
+    std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
+    assert(b==nn);
+  }
+#endif
+

 };
+
 // Fine Object == (per site) type of fine field
 // nbasis      == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
 class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
 public:
    
-  typedef iVector<CComplex,nbasis >             siteVector;
+  typedef iVector<CComplex,nbasis >           siteVector;
+  typedef Lattice<CComplex >                  CoarseComplexField;
  typedef Lattice<siteVector>                 CoarseVector;
  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-
+  typedef iMatrix<CComplex,nbasis >  Cobj;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;

@@ -293,7 +535,6 @@ public:
  CartesianStencil<siteVector,siteVector,int> Stencil; 

  std::vector<CoarseMatrix> A;
-
      
  ///////////////////////
  // Interface
@@ -305,33 +546,71 @@ public:
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());

-    RealD Nin = norm2(in);
+    //    RealD Nin = norm2(in);
    SimpleCompressor<siteVector> compressor;
+
+    double comms_usec = -usecond();
    Stencil.HaloExchange(in,compressor);
+    comms_usec += usecond();
+
    auto in_v = in.View();
    auto out_v = out.View();
-    thread_for(ss,Grid()->oSites(),{
-      siteVector res = Zero();
-      siteVector nbr;
+    typedef LatticeView<Cobj> Aview;
+
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
+    Aview *Aview_p = & AcceleratorViewContainer[0];
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    GridStopWatch ArithmeticTimer;
+    int osites=Grid()->oSites();
+    //    double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
+    //    double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
+    double usecs =-usecond();
+    // assert(geom.npoint==9);
+
+    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
+      int ss = sss/nbasis;
+      int b  = sss%nbasis;
+      calcComplex res = Zero();
+      calcVector nbr;
      int ptype;
      StencilEntry *SE;
+
+      int lane=SIMTlane(Nsimd);
      for(int point=0;point<geom.npoint;point++){

 	SE=Stencil.GetEntry(ptype,point,ss);
 	  
-	if(SE->_is_local&&SE->_permute) { 
-	  permute(nbr,in_v[SE->_offset],ptype);
-	} else if(SE->_is_local) { 
-	  nbr = in_v[SE->_offset];
+	if(SE->_is_local) { 
+	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
 	} else {
-	  nbr = Stencil.CommBuf()[SE->_offset];
+	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+	}
+	synchronise();
+
+	for(int bb=0;bb<nbasis;bb++) {
+	  res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 	}
-	auto A_point = A[point].View();
-	res = res + A_point[ss]*nbr;
      }
-      vstream(out_v[ss],res);
+      coalescedWrite(out_v[ss](b),res,lane);
    });
+    usecs +=usecond();
+
+    double nrm_usec=-usecond();
    RealD Nout= norm2(out);
+    nrm_usec+=usecond();
+
+    /*
+        std::cout << GridLogMessage << "\tNorm        " << nrm_usec << " us" <<std::endl;
+        std::cout << GridLogMessage << "\tHalo        " << comms_usec << " us" <<std::endl;
+        std::cout << GridLogMessage << "\tMatrix      " << usecs << " us" <<std::endl;
+        std::cout << GridLogMessage << "\t  mflop/s   " << flops/usecs<<std::endl;
+        std::cout << GridLogMessage << "\t  MB/s      " << bytes/usecs<<std::endl;
+    */
    return Nout;
  };

@@ -349,25 +628,54 @@ public:
      return norm2(out);
    }
  };
-
-  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
-    
-    conformable(_grid,in.Grid());
-    conformable(in.Grid(),out.Grid());
-    
+  void MdirComms(const CoarseVector &in)
+  {
    SimpleCompressor<siteVector> compressor;
    Stencil.HaloExchange(in,compressor);
-    
-    auto point = [dir, disp](){
-      if(dir == 0 and disp == 0)
-	return 8;
-      else
-	return (4 * dir + 1 - disp) / 2;
-    }();
+  }
+  void MdirCalc(const CoarseVector &in, CoarseVector &out, int point)
+  {
+    conformable(_grid,in.Grid());
+    conformable(_grid,out.Grid());
+
+    typedef LatticeView<Cobj> Aview;
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
+    Aview *Aview_p = & AcceleratorViewContainer[0];

    auto out_v = out.View();
    auto in_v  = in.View();
-    thread_for(ss,Grid()->oSites(),{
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
+      int ss = sss/nbasis;
+      int b  = sss%nbasis;
+      calcComplex res = Zero();
+      calcVector nbr;
+      int ptype;
+      StencilEntry *SE;
+
+      int lane=SIMTlane(Nsimd);
+      SE=Stencil.GetEntry(ptype,point,ss);
+	  
+      if(SE->_is_local) { 
+	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+      } else {
+	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+      }
+      synchronise();
+
+      for(int bb=0;bb<nbasis;bb++) {
+	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+      }
+      coalescedWrite(out_v[ss](b),res,lane);
+    });
+#if 0
+    accelerator_for(ss,Grid()->oSites(),1,{
+
      siteVector res = Zero();
      siteVector nbr;
      int ptype;
@@ -382,16 +690,65 @@ public:
      } else {
 	nbr = Stencil.CommBuf()[SE->_offset];
      }
+      synchronise();

-      auto A_point = A[point].View();
-      res = res + A_point[ss]*nbr;
+      res = res + Aview_p[point][ss]*nbr;
      
-      vstream(out_v[ss],res);
+      out_v[ss]=res;
    });
+#endif
+  }
+  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
+  {
+    this->MdirComms(in);
+    int ndir=geom.npoint-1;
+    if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { 
+      std::cout <<"MdirAll out size "<< out.size()<<std::endl;
+      std::cout <<"MdirAll ndir "<< ndir<<std::endl;
+      assert(0);
+    }
+    for(int p=0;p<ndir;p++){
+      MdirCalc(in,out[p],p);
+    }
+  };
+  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
+
+    this->MdirComms(in);
+
+    int ndim = in.Grid()->Nd();
+
+    //////////////
+    // 4D action like wilson
+    // 0+ => 0 
+    // 0- => 1
+    // 1+ => 2 
+    // 1- => 3
+    // etc..
+    //////////////
+    // 5D action like DWF
+    // 1+ => 0 
+    // 1- => 1
+    // 2+ => 2 
+    // 2- => 3
+    // etc..
+    auto point = [dir, disp, ndim](){
+      if(dir == 0 and disp == 0)
+	return 8;
+      else if ( ndim==4 ) { 
+	return (4 * dir + 1 - disp) / 2;
+      } else { 
+	return (4 * (dir-1) + 1 - disp) / 2;
+      }
+    }();
+
+    MdirCalc(in,out,point);
+
  };

-  void Mdiag(const CoarseVector &in, CoarseVector &out){
-    Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
+  void Mdiag(const CoarseVector &in, CoarseVector &out)
+  {
+    int point=geom.npoint-1;
+    MdirCalc(in, out, point); // No comms
  };

  
@@ -401,25 +758,44 @@ public:
    geom(CoarseGrid._ndimension),
    hermitian(hermitian_),
    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-    A(geom.npoint,&CoarseGrid)
+      A(geom.npoint,&CoarseGrid)
  {
  };

  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & Subspace){
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
+    typedef typename Fobj::scalar_type scalar_type;

-    FineField iblock(FineGrid); // contributions from within this block
-    FineField oblock(FineGrid); // contributions from outwith this block
+    FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
+    FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
+
+    std::vector<FineComplexField> masks(geom.npoint,FineGrid);
+    FineComplexField imask(FineGrid); // contributions from within this block
+    FineComplexField omask(FineGrid); // contributions from outwith this block
+
+    FineComplexField evenmask(FineGrid);
+    FineComplexField oddmask(FineGrid); 

    FineField     phi(FineGrid);
    FineField     tmp(FineGrid);
    FineField     zz(FineGrid); zz=Zero();
    FineField    Mphi(FineGrid);
+    FineField    Mphie(FineGrid);
+    FineField    Mphio(FineGrid);
+    std::vector<FineField>     Mphi_p(geom.npoint,FineGrid);

-    Lattice<iScalar<vInteger> > coor(FineGrid);
+    Lattice<iScalar<vInteger> > coor (FineGrid);
+    Lattice<iScalar<vInteger> > bcoor(FineGrid);
+    Lattice<iScalar<vInteger> > bcb  (FineGrid); bcb = Zero();

    CoarseVector iProj(Grid()); 
    CoarseVector oProj(Grid()); 
+    CoarseVector SelfProj(Grid()); 
+    CoarseComplexField iZProj(Grid()); 
+    CoarseComplexField oZProj(Grid()); 
+
    CoarseScalar InnerProd(Grid()); 

    // Orthogonalise the subblocks over the basis
@@ -428,69 +804,117 @@ public:
    // Compute the matrix elements of linop between this orthonormal
    // set of vectors.
    int self_stencil=-1;
-    for(int p=0;p<geom.npoint;p++){ 
+    for(int p=0;p<geom.npoint;p++)
+    { 
+      int dir   = geom.directions[p];
+      int disp  = geom.displacements[p];
      A[p]=Zero();
      if( geom.displacements[p]==0){
 	self_stencil=p;
      }
+
+      Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+
+      LatticeCoordinate(coor,dir);
+
+      ///////////////////////////////////////////////////////
+      // Work out even and odd block checkerboarding for fast diagonal term
+      ///////////////////////////////////////////////////////
+      if ( disp==1 ) {
+	bcb   = bcb + div(coor,block);
+      }
+	
+      if ( disp==0 ) {
+	  masks[p]= Zero();
+      } else if ( disp==1 ) {
+	masks[p] = where(mod(coor,block)==(block-1),one,zero);
+      } else if ( disp==-1 ) {
+	masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
+      }
    }
+    evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
+    oddmask  = one-evenmask;
+
    assert(self_stencil!=-1);

    for(int i=0;i<nbasis;i++){
+
      phi=Subspace.subspace[i];
-	
-      std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
+
+      //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
+      linop.OpDirAll(phi,Mphi_p);
+      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);

      for(int p=0;p<geom.npoint;p++){ 

+	Mphi = Mphi_p[p];
+
 	int dir   = geom.directions[p];
 	int disp  = geom.displacements[p];

-	Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+	if ( (disp==-1) || (!hermitian ) ) {

-	LatticeCoordinate(coor,dir);
-
-	if ( disp==0 ){
-	  linop.OpDiag(phi,Mphi);
-	}
-	else  {
-	  linop.OpDir(phi,Mphi,dir,disp); 
-	}
-
-	////////////////////////////////////////////////////////////////////////
-	// Pick out contributions coming from this cell and neighbour cell
-	////////////////////////////////////////////////////////////////////////
-	if ( disp==0 ) {
-	  iblock = Mphi;
-	  oblock = Zero();
-	} else if ( disp==1 ) {
-	  oblock = where(mod(coor,block)==(block-1),Mphi,zz);
-	  iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
-	} else if ( disp==-1 ) {
-	  oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
-	  iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
-	} else {
-	  assert(0);
-	}
-
-	Subspace.ProjectToSubspace(iProj,iblock);
-	Subspace.ProjectToSubspace(oProj,oblock);
-	//	  blockProject(iProj,iblock,Subspace.subspace);
-	//	  blockProject(oProj,oblock,Subspace.subspace);
-	auto iProj_v = iProj.View() ;
-	auto oProj_v = oProj.View() ;
-	auto A_p     =  A[p].View();
-	auto A_self  = A[self_stencil].View();
-	thread_for(ss, Grid()->oSites(),{
+	  ////////////////////////////////////////////////////////////////////////
+	  // Pick out contributions coming from this cell and neighbour cell
+	  ////////////////////////////////////////////////////////////////////////
+	  omask = masks[p];
+	  imask = one-omask;
+	
 	  for(int j=0;j<nbasis;j++){
-	    if( disp!= 0 ) {
-	      A_p[ss](j,i) = oProj_v[ss](j);
-	    }
-	    A_self[ss](j,i) =	A_self[ss](j,i) + iProj_v[ss](j);
+	    
+	    blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
+	    
+	    auto iZProj_v = iZProj.View() ;
+	    auto oZProj_v = oZProj.View() ;
+	    auto A_p     =  A[p].View();
+	    auto A_self  = A[self_stencil].View();
+
+	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
+	    //      if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
+	    //	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });
+
+	  }
+	}
+      }
+
+      ///////////////////////////////////////////
+      // Faster alternate self coupling.. use hermiticity to save 2x
+      ///////////////////////////////////////////
+      {
+	mult(tmp,phi,evenmask);  linop.Op(tmp,Mphie);
+	mult(tmp,phi,oddmask );  linop.Op(tmp,Mphio);
+
+	{
+	  auto tmp_      = tmp.View();
+	  auto evenmask_ = evenmask.View();
+	  auto oddmask_  =  oddmask.View();
+	  auto Mphie_    =  Mphie.View();
+	  auto Mphio_    =  Mphio.View();
+	  accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ 
+	      coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
+	    });
+	}
+
+	blockProject(SelfProj,tmp,Subspace.subspace);
+
+	auto SelfProj_ = SelfProj.View();
+	auto A_self  = A[self_stencil].View();
+
+	accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
+	  for(int j=0;j<nbasis;j++){
+	    coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
 	  }
 	});
+
      }
    }
+    if(hermitian) {
+      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
+      ForceHermitian();
+    }
+      // AssertHermitian();
+      // ForceDiagonal();
+  }

 #if 0
    ///////////////////////////
@@ -513,17 +937,26 @@ public:
    std::cout<<GridLogMessage<< iProj <<std::endl;
    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
-      //      ForceHermitian();
-      // AssertHermitian();
-      // ForceDiagonal();
-  }
+

  void ForceHermitian(void) {
-    for(int d=0;d<4;d++){
-      int dd=d+1;
-      A[2*d] = adj(Cshift(A[2*d+1],dd,1));
+    CoarseMatrix Diff  (Grid());
+    for(int p=0;p<geom.npoint;p++){
+      int dir   = geom.directions[p];
+      int disp  = geom.displacements[p];
+      if(disp==-1) {
+	// Find the opposite link
+	for(int pp=0;pp<geom.npoint;pp++){
+	  int dirp   = geom.directions[pp];
+	  int dispp  = geom.displacements[pp];
+	  if ( (dirp==dir) && (dispp==1) ){
+	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp]; 
+	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
+	    A[pp] = adj(Cshift(A[p],dir,1));
+	  }
+	}
+      }
    }
-    //      A[8] = 0.5*(A[8] + adj(A[8]));
  }
  void AssertHermitian(void) {
    CoarseMatrix AA    (Grid());
@@ -47,6 +47,7 @@ public:
  // Support for coarsening to a multigrid
  virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
  virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
+  virtual void OpDirAll  (const Field &in, std::vector<Field> &out) = 0; // Abstract base

  virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
@@ -83,6 +84,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -93,8 +97,7 @@ public:
    _Mat.MdagM(in,out,n1,n2);
  }
  void HermOp(const Field &in, Field &out){
-    RealD n1,n2;
-    HermOpAndNorm(in,out,n1,n2);
+    _Mat.MdagM(in,out);
  }
 };

@@ -116,6 +119,9 @@ public:
    _Mat.Mdir(in,out,dir,disp);
    assert(0);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
    assert(0);
@@ -154,6 +160,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -162,7 +171,6 @@ public:
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    _Mat.M(in,out);
-	
    ComplexD dot= innerProduct(in,out); n1=real(dot);
    n2=norm2(out);
  }
@@ -183,6 +191,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -234,6 +245,9 @@ public:
      void OpDir  (const Field &in, Field &out,int dir,int disp) {
 	assert(0);
      }
+      void OpDirAll  (const Field &in, std::vector<Field> &out){
+	assert(0);
+      };
    };
    template<class Matrix,class Field>
    class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
@@ -320,9 +334,135 @@ public:
 	return axpy_norm(out,-1.0,tmp,in);
      }
    };
+
+    template<class Field>
+    class NonHermitianSchurOperatorBase :  public LinearOperatorBase<Field> 
+    {
+      public:
+        virtual RealD Mpc      (const Field& in, Field& out) = 0;
+        virtual RealD MpcDag   (const Field& in, Field& out) = 0;
+        virtual void  MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) {
+          Field tmp(in.Grid());
+          tmp.Checkerboard() = in.Checkerboard();
+	        ni = Mpc(in,tmp);
+	        no = MpcDag(tmp,out);
+        }
+        virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
+          assert(0);
+        }
+        virtual void HermOp(const Field& in, Field& out) {
+          assert(0);
+        }
+        void Op(const Field& in, Field& out) {
+          Mpc(in, out);
+        }
+        void AdjOp(const Field& in, Field& out) { 
+          MpcDag(in, out);
+        }
+        // Support for coarsening to a multigrid
+        void OpDiag(const Field& in, Field& out) {
+          assert(0); // must coarsen the unpreconditioned system
+        }
+        void OpDir(const Field& in, Field& out, int dir, int disp) {
+          assert(0);
+        }
+    };
+
+    template<class Matrix, class Field>
+    class NonHermitianSchurDiagMooeeOperator :  public NonHermitianSchurOperatorBase<Field> 
+    {
+      public:
+        Matrix& _Mat;
+        NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
+        virtual RealD Mpc(const Field& in, Field& out) {
+          Field tmp(in.Grid());
+          tmp.Checkerboard() = !in.Checkerboard();
+
+  	      _Mat.Meooe(in, tmp);
+	        _Mat.MooeeInv(tmp, out);
+	        _Mat.Meooe(out, tmp);
+
+	        _Mat.Mooee(in, out);
+	
+          return axpy_norm(out, -1.0, tmp, out);
+        }
+        virtual RealD MpcDag(const Field& in, Field& out) {
+	        Field tmp(in.Grid());
+
+	        _Mat.MeooeDag(in, tmp);
+          _Mat.MooeeInvDag(tmp, out);
+	        _Mat.MeooeDag(out, tmp);
+
+	        _Mat.MooeeDag(in, out);
+	
+          return axpy_norm(out, -1.0, tmp, out);
+      }
+    };
+    
+    template<class Matrix,class Field>
+    class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field> 
+    {
+      protected:
+        Matrix &_Mat;
+    
+      public:
+        NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
+        virtual RealD Mpc(const Field& in, Field& out) {
+	        Field tmp(in.Grid());
+
+	        _Mat.Meooe(in, out);
+	        _Mat.MooeeInv(out, tmp);
+	        _Mat.Meooe(tmp, out);
+	        _Mat.MooeeInv(out, tmp);
+
+	        return axpy_norm(out, -1.0, tmp, in);
+        }
+        virtual RealD MpcDag(const Field& in, Field& out) {
+	        Field tmp(in.Grid());
+
+	        _Mat.MooeeInvDag(in, out);
+	        _Mat.MeooeDag(out, tmp);
+	        _Mat.MooeeInvDag(tmp, out);
+	        _Mat.MeooeDag(out, tmp);
+
+	        return axpy_norm(out, -1.0, tmp, in);
+        }
+    };
+
+    template<class Matrix, class Field>
+    class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field> 
+    {
+      protected:
+        Matrix& _Mat;
+    
+      public:
+        NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
+
+        virtual RealD Mpc(const Field& in, Field& out) {
+          Field tmp(in.Grid());
+
+	        _Mat.MooeeInv(in, out);
+	        _Mat.Meooe(out, tmp);
+	        _Mat.MooeeInv(tmp, out);
+	        _Mat.Meooe(out, tmp);
+
+	        return axpy_norm(out, -1.0, tmp, in);
+        }
+        virtual RealD MpcDag(const Field& in, Field& out) {
+	        Field tmp(in.Grid());
+
+          _Mat.MeooeDag(in, out);
+          _Mat.MooeeInvDag(out, tmp);
+          _Mat.MeooeDag(tmp, out);
+          _Mat.MooeeInvDag(out, tmp);
+
+          return axpy_norm(out, -1.0, tmp, in);
+        }
+    };
+
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
-    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
+    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
    template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
@@ -45,8 +45,13 @@ public:
    ni=M(in,tmp);
    no=Mdag(tmp,out);
  }
+  virtual void  MdagM(const Field &in, Field &out) {
+    RealD ni, no;
+    MdagM(in,out,ni,no);
+  }
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
 };

 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -56,12 +61,12 @@ template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrix
 public:
  virtual GridBase *RedBlackGrid(void)=0;

-      //////////////////////////////////////////////////////////////////////
-      // Query the even even properties to make algorithmic decisions
-      //////////////////////////////////////////////////////////////////////
-      virtual RealD  Mass(void)        { return 0.0; };
-      virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
-      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
+  //////////////////////////////////////////////////////////////////////
+  // Query the even even properties to make algorithmic decisions
+  //////////////////////////////////////////////////////////////////////
+  virtual RealD  Mass(void)        { return 0.0; };
+  virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
+  virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better

  // half checkerboard operaions
  virtual  void Meooe    (const Field &in, Field &out)=0;
@@ -94,6 +94,24 @@ public:
    Coeffs.assign(0.,order);
    Coeffs[order-1] = 1.;
  };
+  
+  // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
+  // Similar kick effect below the threshold as Lanczos filter approach
+  void InitLowPass(RealD _lo,RealD _hi,int _order)
+  {
+    lo=_lo;
+    hi=_hi;
+    order=_order;
+      
+    if(order < 2) exit(-1);
+    Coeffs.resize(order);
+    for(int j=0;j<order;j++){
+      RealD k=(order-1.0);
+      RealD s=std::cos( j*M_PI*(k+0.5)/order );
+      Coeffs[j] = s * 2.0/order;
+    }
+    
+  };

  void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
  {
@@ -234,20 +252,20 @@ public:
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    Linop.HermOp(T0,y);
-    T1=y*xscale+in*mscale;
+    axpby(T1,xscale,mscale,y,in);

    // sum = .5 c[0] T0 + c[1] T1
-    out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
+    //    out = ()*T0 + Coeffs[1]*T1;
+    axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
    for(int n=2;n<order;n++){
 	
      Linop.HermOp(*Tn,y);
-
-      y=xscale*y+mscale*(*Tn);
-
-      *Tnp=2.0*y-(*Tnm);
-
-      out=out+Coeffs[n]* (*Tnp);
-
+      //     y=xscale*y+mscale*(*Tn);
+      //      *Tnp=2.0*y-(*Tnm);
+      //      out=out+Coeffs[n]* (*Tnp);
+      axpby(y,xscale,mscale,y,(*Tn));
+      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
+      axpy(out,Coeffs[n],*Tnp,out);
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
@@ -0,0 +1,129 @@
+#ifndef GRID_JACOBIPOLYNOMIAL_H
+#define GRID_JACOBIPOLYNOMIAL_H
+
+#include <Grid/algorithms/LinearOperator.h>
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Field>
+class JacobiPolynomial : public OperatorFunction<Field> {
+ private:
+  using OperatorFunction<Field>::operator();
+
+  int order;
+  RealD hi;
+  RealD lo;
+  RealD alpha;
+  RealD beta;
+
+ public:
+  void csv(std::ostream &out){
+    csv(out,lo,hi);
+  }
+  void csv(std::ostream &out,RealD llo,RealD hhi){
+    RealD diff = hhi-llo;
+    RealD delta = diff*1.0e-5;
+    for (RealD x=llo-delta; x<=hhi; x+=delta) {
+      RealD f = approx(x);
+      out<< x<<" "<<f <<std::endl;
+    }
+    return;
+  }
+
+  JacobiPolynomial(){};
+  JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta)
+  {
+      lo=_lo;
+      hi=_hi;
+      alpha=_alpha;
+      beta=_beta;
+      order=_order;
+  };
+
+  RealD approx(RealD x) // Convenience for plotting the approximation                                                       
+  {
+    RealD Tn;
+    RealD Tnm;
+    RealD Tnp;
+
+    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+
+    RealD T0=1.0;
+    RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
+
+    Tn =T1;
+    Tnm=T0;
+    for(int n=2;n<=order;n++){
+      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
+      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
+      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
+      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
+      Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
+      Tnm=Tn;
+      Tn =Tnp;
+    }
+    return Tnp;
+  };
+
+  // Implement the required interface                                                                                       
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+    GridBase *grid=in.Grid();
+
+    int vol=grid->gSites();
+
+    Field T0(grid);
+    Field T1(grid);
+    Field T2(grid);
+    Field y(grid);
+
+
+    Field *Tnm = &T0;
+    Field *Tn  = &T1;
+    Field *Tnp = &T2;
+
+    //    RealD T0=1.0;                                                                                                     
+    T0=in;
+
+    //    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));                                                                           
+    //           = x * 2/(hi-lo) - (hi+lo)/(hi-lo)                                                                          
+    Linop.HermOp(T0,y);
+    RealD xscale = 2.0/(hi-lo);
+    RealD mscale = -(hi+lo)/(hi-lo);
+    Linop.HermOp(T0,y);
+    y=y*xscale+in*mscale;
+
+    // RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
+    RealD halfAmB  = (alpha-beta)*0.5;
+    RealD halfApBp2= (alpha+beta+2.0)*0.5;
+    T1 = halfAmB * in + halfApBp2*y;
+
+    for(int n=2;n<=order;n++){
+
+      Linop.HermOp(*Tn,y);
+      y=xscale*y+mscale*(*Tn);
+
+      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
+      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
+      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
+      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
+
+      //      Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;                                                             
+      cny=cny/cnp;
+      cn1=cn1/cnp;
+      cn1=cn1/cnp;
+      cnm=cnm/cnp;
+
+      *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
+
+      // Cycle pointers to avoid copies                                                                                     
+      Field *swizzle = Tnm;
+      Tnm    =Tn;
+      Tn     =Tnp;
+      Tnp    =swizzle;
+    }
+    out=*Tnp;
+
+  }
+};
+NAMESPACE_END(Grid);
+#endif
@@ -0,0 +1,222 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/BiCGSTAB.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: juettner <juettner@soton.ac.uk>
+Author: David Murphy <djmurphy@mit.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef GRID_BICGSTAB_H
+#define GRID_BICGSTAB_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////
+// Base classes for iterative processes based on operators
+// single input vec, single output vec.
+/////////////////////////////////////////////////////////////
+
+template <class Field>
+class BiCGSTAB : public OperatorFunction<Field> 
+{
+  public:
+    using OperatorFunction<Field>::operator();
+    
+    bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+                             // Defaults true.
+    RealD Tolerance;
+    Integer MaxIterations;
+    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  
+    BiCGSTAB(RealD tol, Integer maxit, bool err_on_no_conv = true) : 
+      Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){};
+
+    void operator()(LinearOperatorBase<Field>& Linop, const Field& src, Field& psi) 
+    {
+      psi.Checkerboard() = src.Checkerboard();
+      conformable(psi, src);
+
+      RealD cp(0), rho(1), rho_prev(0), alpha(1), beta(0), omega(1);
+      RealD a(0), bo(0), b(0), ssq(0);
+
+      Field p(src);
+      Field r(src);
+      Field rhat(src);
+      Field v(src);
+      Field s(src);
+      Field t(src);
+      Field h(src);
+
+      v = Zero();
+      p = Zero();
+
+      // Initial residual computation & set up
+      RealD guess = norm2(psi);
+      assert(std::isnan(guess) == 0);
+    
+      Linop.Op(psi, v);
+      b = norm2(v);
+
+      r = src - v;
+      rhat = r;
+      a = norm2(r);
+      ssq = norm2(src);
+
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: guess " << guess << std::endl;
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:   src " << ssq << std::endl;
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:    mp " << b << std::endl;
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:     r " << a << std::endl;
+
+      RealD rsq = Tolerance * Tolerance * ssq;
+
+      // Check if guess is really REALLY good :)
+      if(a <= rsq){ return; }
+
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: k=0 residual " << a << " target " << rsq << std::endl;
+
+      GridStopWatch LinalgTimer;
+      GridStopWatch InnerTimer;
+      GridStopWatch AxpyNormTimer;
+      GridStopWatch LinearCombTimer;
+      GridStopWatch MatrixTimer;
+      GridStopWatch SolverTimer;
+
+      SolverTimer.Start();
+      int k;
+      for (k = 1; k <= MaxIterations; k++) 
+      {
+        rho_prev = rho;
+
+        LinalgTimer.Start();
+        InnerTimer.Start();
+        ComplexD Crho  = innerProduct(rhat,r);
+        InnerTimer.Stop();
+        rho = Crho.real();
+
+        beta = (rho / rho_prev) * (alpha / omega);
+
+        LinearCombTimer.Start();
+        bo = beta * omega;
+        auto p_v = p.View();
+        auto r_v = r.View();
+        auto v_v = v.View();
+        accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
+          coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
+        });
+        LinearCombTimer.Stop();
+        LinalgTimer.Stop();
+
+        MatrixTimer.Start();
+        Linop.Op(p,v);
+        MatrixTimer.Stop();
+
+        LinalgTimer.Start();
+        InnerTimer.Start();
+        ComplexD Calpha = innerProduct(rhat,v);
+        InnerTimer.Stop();
+        alpha = rho / Calpha.real();
+
+        LinearCombTimer.Start();
+        auto h_v = h.View();
+        auto psi_v = psi.View();
+        accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
+          coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
+        });
+        
+        auto s_v = s.View();
+        accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
+          coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
+        });
+        LinearCombTimer.Stop();
+        LinalgTimer.Stop();
+
+        MatrixTimer.Start();
+        Linop.Op(s,t);
+        MatrixTimer.Stop();
+
+        LinalgTimer.Start();
+        InnerTimer.Start();
+        ComplexD Comega = innerProduct(t,s);
+        InnerTimer.Stop();
+        omega = Comega.real() / norm2(t);
+
+        LinearCombTimer.Start();
+        auto t_v = t.View();
+        accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
+          coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
+          coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
+        });
+        LinearCombTimer.Stop();
+
+        cp = norm2(r);
+        LinalgTimer.Stop();
+
+        std::cout << GridLogIterative << "BiCGSTAB: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
+
+        // Stopping condition
+        if(cp <= rsq) 
+        {
+          SolverTimer.Stop();
+          Linop.Op(psi, v);
+          p = v - src;
+
+          RealD srcnorm = sqrt(norm2(src));
+          RealD resnorm = sqrt(norm2(p));
+          RealD true_residual = resnorm / srcnorm;
+
+          std::cout << GridLogMessage << "BiCGSTAB Converged on iteration " << k << std::endl;
+          std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp/ssq) << std::endl;
+          std::cout << GridLogMessage << "\tTrue residual " << true_residual << std::endl;
+          std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+
+          std::cout << GridLogMessage << "Time breakdown " << std::endl;
+          std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
+
+          if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }
+
+          IterationsToComplete = k;	
+
+          return;
+        }
+      }
+      
+      std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
+
+      if(ErrorOnNoConverge){ assert(0); }
+      IterationsToComplete = k;
+    }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
@@ -0,0 +1,158 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: ./lib/algorithms/iterative/BiCGSTABMixedPrec.h
+
+Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+Author: David Murphy <djmurphy@mit.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef GRID_BICGSTAB_MIXED_PREC_H
+#define GRID_BICGSTAB_MIXED_PREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+// Mixed precision restarted defect correction BiCGSTAB
+template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
+{
+  public:                                                
+    RealD   Tolerance;
+    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid; // Grid for single-precision fields
+    RealD OuterLoopNormMult; // Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+    LinearOperatorBase<FieldF> &Linop_f;
+    LinearOperatorBase<FieldD> &Linop_d;
+
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+
+    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+    LinearFunction<FieldF> *guesser;
+    
+    MixedPrecisionBiCGSTAB(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, 
+        LinearOperatorBase<FieldF>& _Linop_f, LinearOperatorBase<FieldD>& _Linop_d) : 
+      Linop_f(_Linop_f), Linop_d(_Linop_d), Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), OuterLoopNormMult(100.), guesser(NULL) {};
+
+    void useGuesser(LinearFunction<FieldF>& g){
+      guesser = &g;
+    }
+  
+    void operator() (const FieldD& src_d_in, FieldD& sol_d)
+    {
+      TotalInnerIterations = 0;
+    
+      GridStopWatch TotalTimer;
+      TotalTimer.Start();
+      
+      int cb = src_d_in.Checkerboard();
+      sol_d.Checkerboard() = cb;
+      
+      RealD src_norm = norm2(src_d_in);
+      RealD stop = src_norm * Tolerance*Tolerance;
+
+      GridBase* DoublePrecGrid = src_d_in.Grid();
+      FieldD tmp_d(DoublePrecGrid);
+      tmp_d.Checkerboard() = cb;
+      
+      FieldD tmp2_d(DoublePrecGrid);
+      tmp2_d.Checkerboard() = cb;
+      
+      FieldD src_d(DoublePrecGrid);
+      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
+      
+      RealD inner_tol = InnerTolerance;
+      
+      FieldF src_f(SinglePrecGrid);
+      src_f.Checkerboard() = cb;
+      
+      FieldF sol_f(SinglePrecGrid);
+      sol_f.Checkerboard() = cb;
+      
+      BiCGSTAB<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      CG_f.ErrorOnNoConverge = false;
+
+      GridStopWatch InnerCGtimer;
+
+      GridStopWatch PrecChangeTimer;
+      
+      Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+        
+      for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++)
+      {
+        // Compute double precision rsd and also new RHS vector.
+        Linop_d.Op(sol_d, tmp_d);
+        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
+        
+        std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration " << outer_iter << " residual " << norm << " target " << stop << std::endl;
+
+        if(norm < OuterLoopNormMult * stop){
+          std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration converged on iteration " << outer_iter << std::endl;
+          break;
+        }
+        while(norm * inner_tol * inner_tol < stop){ inner_tol *= 2; } // inner_tol = sqrt(stop/norm) ??
+
+        PrecChangeTimer.Start();
+        precisionChange(src_f, src_d);
+        PrecChangeTimer.Stop();
+        
+        sol_f = Zero();
+
+        //Optionally improve inner solver guess (eg using known eigenvectors)
+        if(guesser != NULL){ (*guesser)(src_f, sol_f); }
+
+        //Inner CG
+        CG_f.Tolerance = inner_tol;
+        InnerCGtimer.Start();
+        CG_f(Linop_f, src_f, sol_f);
+        InnerCGtimer.Stop();
+        TotalInnerIterations += CG_f.IterationsToComplete;
+        
+        //Convert sol back to double and add to double prec solution
+        PrecChangeTimer.Start();
+        precisionChange(tmp_d, sol_f);
+        PrecChangeTimer.Stop();
+        
+        axpy(sol_d, 1.0, tmp_d, sol_d);
+      }
+      
+      //Final trial CG
+      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Starting final patch-up double-precision solve" << std::endl;
+      
+      BiCGSTAB<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      CG_d(Linop_d, src_d_in, sol_d);
+      TotalFinalStepIterations = CG_d.IterationsToComplete;
+
+      TotalTimer.Stop();
+      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
+      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
@@ -52,6 +52,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  Integer PrintInterval; //GridLogMessages or Iterative
+  RealD TrueResidual;
  
  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
@@ -306,7 +307,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)

      Linop.HermOp(X, AD);
      AD = AD-B;
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
+      TrueResidual = std::sqrt(norm2(AD)/norm2(B));
+      std::cout << GridLogMessage <<"\tTrue residual is " << TrueResidual <<std::endl;

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -442,7 +444,8 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &

      Linop.HermOp(Psi, AP);
      AP = AP-Src;
-      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+      TrueResidual = std::sqrt(norm2(AP)/norm2(Src));
+      std::cout <<GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -653,7 +656,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
      if ( rr > max_resid ) max_resid = rr;
    }

-    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
+    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;

    if ( max_resid < Tolerance*Tolerance ) { 

@@ -668,7 +671,8 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field

      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
+      TrueResidual = std::sqrt(normv(AD)/normv(B));
+      std::cout << GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -49,6 +49,7 @@ public:
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  RealD TrueResidual;
  
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Tolerance(tol),
@@ -71,7 +72,6 @@ public:
    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
-
    
    Linop.HermOpAndNorm(psi, mmp, d, b);
    
@@ -82,6 +82,14 @@ public:
    cp = a;
    ssq = norm2(src);

+    // Handle trivial case of zero src
+    if (ssq == 0.){
+      psi = Zero();
+      IterationsToComplete = 1;
+      TrueResidual = 0.;
+      return;
+    }
+
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl;
@@ -93,6 +101,7 @@ public:

    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
+      TrueResidual = std::sqrt(a/ssq);
      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
      IterationsToComplete = 0;	
      return;
@@ -142,7 +151,7 @@ public:
      LinalgTimer.Stop();

      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
-                << " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
+                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;

      // Stopping condition
      if (cp <= rsq) {
@@ -154,26 +163,33 @@ public:
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;

-        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
-        std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
-	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
-	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k 
+		  << "\tComputed residual " << std::sqrt(cp / ssq)
+		  << "\tTrue residual " << true_residual
+		  << "\tTarget " << Tolerance << std::endl;

-        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+        std::cout << GridLogIterative << "Time breakdown "<<std::endl;
+	std::cout << GridLogIterative << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

 	IterationsToComplete = k;	
+	TrueResidual = true_residual;

        return;
      }
    }
+    // Failed. Calculate true residual before giving up                                                         
+    Linop.HermOpAndNorm(psi, mmp, d, qq);
+    p = mmp - src;
+
+    TrueResidual = sqrt(norm2(p)/ssq);
+
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;

    if (ErrorOnNoConverge) assert(0);
@@ -46,15 +46,19 @@ public:

  RealD   Tolerance;
  Integer MaxIterations;
-    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
+  std::vector<RealD> TrueResidualShift;

  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
    verbose=1;
+    IterationsToCompleteShift.resize(_shifts.order);
+    TrueResidualShift.resize(_shifts.order);
  }

  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
@@ -125,6 +129,17 @@ public:
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src);
+
+    // Handle trivial case of zero src.
+    if( cp == 0. ){
+      for(int s=0;s<nshift;s++){
+	psi[s] = Zero();
+	IterationsToCompleteShift[s] = 1;
+	TrueResidualShift[s] = 0.;
+      }
+      return;
+    }
+
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
@@ -270,6 +285,7 @@ public:
      for(int s=0;s<nshift;s++){
      
 	if ( (!converged[s]) ){
+	  IterationsToCompleteShift[s] = k;
 	
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	
@@ -299,7 +315,8 @@ public:
 	  axpy(r,-alpha[s],src,tmp);
 	  RealD rn = norm2(r);
 	  RealD cn = norm2(src);
-	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+	  TrueResidualShift[s] = std::sqrt(rn/cn);
+	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<< TrueResidualShift[s] <<std::endl;
 	}

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
@@ -43,6 +43,11 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
 {
+  // If assume basis[j] are already orthonormal,
+  // can take all inner products in parallel saving 2x bandwidth
+  // Save 3x bandwidth on the second line of loop.
+  // perhaps 2.5x speed up.
+  // 2x overall in Multigrid Lanczos  
  for(int j=0; j<k; ++j){
    auto ip = innerProduct(basis[j],w);
    w = w - ip*basis[j];
@@ -54,16 +59,15 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
 {
  typedef decltype(basis[0].View()) View;
  auto tmp_v = basis[0].View();
-  std::vector<View> basis_v(basis.size(),tmp_v);
+  Vector<View> basis_v(basis.size(),tmp_v);
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();
-      
+
  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].View();
  }
-
+#if 0
  std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
-
  thread_region
  {
    vobj* B = Bt.data() + Nm * thread_num();
@@ -81,24 +85,89 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
      }
    });
  }
+#else
+
+  int nrot = j1-j0;
+
+
+  uint64_t oSites   =grid->oSites();
+  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
+
+  //  printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
+
+  Vector <vobj> Bt(siteBlock * nrot); 
+  auto Bp=&Bt[0];
+
+  // GPU readable copy of Eigen matrix
+  Vector<double> Qt_jv(Nm*Nm);
+  double *Qt_p = & Qt_jv[0];
+  for(int k=0;k<Nm;++k){
+    for(int j=0;j<Nm;++j){
+      Qt_p[j*Nm+k]=Qt(j,k);
+    }
+  }
+
+  // Block the loop to keep storage footprint down
+  vobj zz=Zero();
+  for(uint64_t s=0;s<oSites;s+=siteBlock){
+
+    // remaining work in this block
+    int ssites=MIN(siteBlock,oSites-s);
+
+    // zero out the accumulators
+    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
+	auto z=coalescedRead(zz);
+	coalescedWrite(Bp[ss],z);
+    });
+
+    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
+	
+      int j =sj%nrot;
+      int jj  =j0+j;
+      int ss =sj/nrot;
+      int sss=ss+s;
+
+      for(int k=k0; k<k1; ++k){
+	auto tmp = coalescedRead(Bp[ss*nrot+j]);
+	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
+      }
+    });
+
+    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
+      int j =sj%nrot;
+      int jj  =j0+j;
+      int ss =sj/nrot;
+      int sss=ss+s;
+      coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
+    });
+  }
+#endif
 }

 // Extract a single rotated vector
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 {
+  typedef decltype(basis[0].View()) View;
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();

  result.Checkerboard() = basis[0].Checkerboard();
  auto result_v=result.View();
-  thread_for(ss, grid->oSites(),{
-    vobj B = Zero();
+  Vector<View> basis_v(basis.size(),result_v);
+  for(int k=0;k<basis.size();k++){
+    basis_v[k] = basis[k].View();
+  }
+  vobj zz=Zero();
+  Vector<double> Qt_jv(Nm);
+  double * Qt_j = & Qt_jv[0];
+  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
+  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
+    auto B=coalescedRead(zz);
    for(int k=k0; k<k1; ++k){
-      auto basis_k = basis[k].View();
-      B +=Qt(j,k) * basis_k[ss];
+      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
    }
-    result_v[ss] = B;
+    coalescedWrite(result_v[ss], B);
  });
 }

@@ -282,7 +351,7 @@ public:
 			    RealD _eresid, // resid in lmdue deficit 
 			    int _MaxIter, // Max iterations
 			    RealD _betastp=0.0, // if beta(k) < betastp: converged
-			    int _MinRestart=1, int _orth_period = 1,
+			    int _MinRestart=0, int _orth_period = 1,
 			    IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    SimpleTester(HermOp), _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(Tester),
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
@@ -298,7 +367,7 @@ public:
 			       RealD _eresid, // resid in lmdue deficit 
 			       int _MaxIter, // Max iterations
 			       RealD _betastp=0.0, // if beta(k) < betastp: converged
-			       int _MinRestart=1, int _orth_period = 1,
+			       int _MinRestart=0, int _orth_period = 1,
 			       IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    SimpleTester(HermOp),  _PolyOp(PolyOp),      _HermOp(HermOp), _Tester(SimpleTester),
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
@@ -347,7 +416,7 @@ until convergence
    GridBase *grid = src.Grid();
    assert(grid == evec[0].Grid());
    
-    GridLogIRL.TimingMode(1);
+    //    GridLogIRL.TimingMode(1);
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -372,14 +441,17 @@ until convergence
    {
      auto src_n = src;
      auto tmp = src;
+      std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl;
      const int _MAX_ITER_IRL_MEVAPP_ = 50;
      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
 	normalise(src_n);
 	_HermOp(src_n,tmp);
+	//	std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
+	//	std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
 	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
 	RealD vden = norm2(src_n);
 	RealD na = vnum/vden;
-	if (fabs(evalMaxApprox/na - 1.0) < 0.05)
+	if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
 	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@@ -577,11 +649,11 @@ until convergence
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
 2. For k = 1,2,...,m Do:
-3. wk:=Avk−βkv_{k−1}      
-4. αk:=(wk,vk)       // 
-5. wk:=wk−αkvk       // wk orthog vk 
-6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-7. vk+1 := wk/βk+1
+3. wk:=Avk - b_k v_{k-1}      
+4. ak:=(wk,vk)       // 
+5. wk:=wk-akvk       // wk orthog vk 
+6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop
+7. vk+1 := wk/b_k+1
 8. EndDo
 */
  void step(std::vector<RealD>& lmd,
@@ -589,6 +661,7 @@ until convergence
 	    std::vector<Field>& evec,
 	    Field& w,int Nm,int k)
  {
+    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
    const RealD tiny = 1.0e-20;
    assert( k< Nm );

@@ -600,20 +673,20 @@ until convergence

    if(k>0) w -= lme[k-1] * evec[k-1];

-    ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
+    ComplexD zalph = innerProduct(evec_k,w);
    RealD     alph = real(zalph);

-    w = w - alph * evec_k;// 5. wk:=wk−αkvk
+    w = w - alph * evec_k;

-    RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-    // 7. vk+1 := wk/βk+1
+    RealD beta = normalise(w); 

    lmd[k] = alph;
    lme[k] = beta;

-    if (k>0 && k % orth_period == 0) {
+    if ( (k>0) && ( (k % orth_period) == 0 )) {
+      std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
      orthogonalize(w,evec,k); // orthonormalise
-      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
    }

    if(k < Nm-1) evec[k+1] = w;
@@ -621,6 +694,8 @@ until convergence
    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
+
+    std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
  }

  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
@@ -33,26 +33,78 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Take a matrix and form an NE solver calling a Herm solver
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class NormalEquations : public OperatorFunction<Field>{
+template<class Field> class NormalEquations {
 private:
  SparseMatrixBase<Field> & _Matrix;
  OperatorFunction<Field> & _HermitianSolver;
-
+  LinearFunction<Field>   & _Guess;
 public:

  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  /////////////////////////////////////////////////////
-  NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver) 
-    :  _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; 
+ NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
+		 LinearFunction<Field> &Guess) 
+   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 

  void operator() (const Field &in, Field &out){
 
    Field src(in.Grid());
+    Field tmp(in.Grid());

+    MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
    _Matrix.Mdag(in,src);
-    _HermitianSolver(src,out);  // Mdag M out = Mdag in
+    _Guess(src,out);
+    _HermitianSolver(MdagMOp,src,out);  // Mdag M out = Mdag in
+
+  }     
+};
+
+template<class Field> class HPDSolver {
+private:
+  LinearOperatorBase<Field> & _Matrix;
+  OperatorFunction<Field> & _HermitianSolver;
+  LinearFunction<Field>   & _Guess;
+public:
+
+  /////////////////////////////////////////////////////
+  // Wrap the usual normal equations trick
+  /////////////////////////////////////////////////////
+ HPDSolver(LinearOperatorBase<Field> &Matrix,
+	   OperatorFunction<Field> &HermitianSolver,
+	   LinearFunction<Field> &Guess) 
+   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
+
+  void operator() (const Field &in, Field &out){
 
+    _Guess(in,out);
+    _HermitianSolver(_Matrix,in,out);  // Mdag M out = Mdag in
+
+  }     
+};
+
+
+template<class Field> class MdagMSolver {
+private:
+  SparseMatrixBase<Field> & _Matrix;
+  OperatorFunction<Field> & _HermitianSolver;
+  LinearFunction<Field>   & _Guess;
+public:
+
+  /////////////////////////////////////////////////////
+  // Wrap the usual normal equations trick
+  /////////////////////////////////////////////////////
+ MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
+	     LinearFunction<Field> &Guess) 
+   :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; 
+
+  void operator() (const Field &in, Field &out){
+ 
+    MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
+    _Guess(in,out);
+
+    _HermitianSolver(MdagMOp,in,out);  // Mdag M out = Mdag in
+
  }     
 };

@@ -30,12 +30,12 @@ template<class Field> class PowerMethod
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      
-      if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { 
+      if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
+	std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
 	return evalMaxApprox; 
      } 
      evalMaxApprox = na; 
-      std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
      src_n = tmp;
    }
    assert(0);
@@ -38,10 +38,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 NAMESPACE_BEGIN(Grid);

+#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" " 
+
 template<class Field>
-class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
+class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:                                                
-  using OperatorFunction<Field>::operator();

  RealD   Tolerance;
  Integer MaxIterations;
@@ -49,23 +50,29 @@ public:
  int mmax;
  int nstep;
  int steps;
+  int level;
  GridStopWatch PrecTimer;
  GridStopWatch MatTimer;
  GridStopWatch LinalgTimer;

-  LinearFunction<Field> &Preconditioner;
+  LinearFunction<Field>     &Preconditioner;
+  LinearOperatorBase<Field> &Linop;

-  PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
+  void Level(int lv) { level=lv; };
+
+  PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
    Tolerance(tol), 
    MaxIterations(maxit),
+    Linop(_Linop),
    Preconditioner(Prec),
    mmax(_mmax),
    nstep(_nstep)
  { 
+    level=1;
    verbose=1;
  };

-  void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+  void operator() (const Field &src, Field &psi){

    psi=Zero();
    RealD cp, ssq,rsq;
@@ -84,9 +91,9 @@ public:
    steps=0;
    for(int k=0;k<MaxIterations;k++){

-      cp=GCRnStep(Linop,src,psi,rsq);
+      cp=GCRnStep(src,psi,rsq);

-      std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
+      GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;

      if(cp<rsq) {

@@ -95,24 +102,26 @@ public:
 	Linop.HermOp(psi,r);
 	axpy(r,-1.0,src,r);
 	RealD tr = norm2(r);
-	std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
+	GCRLogLevel<<"PGCR: Converged on iteration " <<steps
 		 << " computed residual "<<sqrt(cp/ssq)
 		 << " true residual "    <<sqrt(tr/ssq)
 		 << " target "           <<Tolerance <<std::endl;

-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
-	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
+	GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
+	/*
+	  GCRLogLevel<<"PGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
+	  GCRLogLevel<<"PGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
+	  GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
+	*/
 	return;
      }

    }
-    std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
-    assert(0);
+    GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
+    //    assert(0);
  }

-  RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
+  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){

    RealD cp;
    RealD a, b;
@@ -134,9 +143,7 @@ public:
    std::vector<Field> p(mmax,grid);
    std::vector<RealD> qq(mmax);
      
-    std::cout<<GridLogIterative<< " ************** "<< std::endl;
-    std::cout<<GridLogIterative<< "   GCRnStep("<<nstep<<")"<<std::endl;
-    std::cout<<GridLogIterative<< " ************** "<< std::endl;
+    GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;

    //////////////////////////////////
    // initial guess x0 is taken as nonzero.
@@ -150,35 +157,15 @@ public:
    LinalgTimer.Start();
    r=src-Az;
    LinalgTimer.Stop();
-    std::cout<<GridLogIterative<< " GCRnStep true residual r = src - A psi   "<<norm2(r) <<std::endl;
+    GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl;
    
    /////////////////////
    // p = Prec(r)
    /////////////////////

-    std::cout<<GridLogIterative<< " GCRnStep apply preconditioner z= M^-1 r "<< std::endl;
-    std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
    PrecTimer.Start();
    Preconditioner(r,z);
    PrecTimer.Stop();
-    std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
-    std::cout<<GridLogIterative<< " GCRnStep called Preconditioner z "<< norm2(z) <<std::endl;
-
-    //    MatTimer.Start();
-    //    Linop.HermOp(z,tmp); 
-    //    MatTimer.Stop();
-
-    //    LinalgTimer.Start();
-    //    ttmp=tmp;
-    //    tmp=tmp-r;
-    //    LinalgTimer.Stop();
-
-    /*
-      std::cout<<GridLogMessage<<r<<std::endl;
-      std::cout<<GridLogMessage<<z<<std::endl;
-      std::cout<<GridLogMessage<<ttmp<<std::endl;
-      std::cout<<GridLogMessage<<tmp<<std::endl;
-    */

    MatTimer.Start();
    Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
@@ -190,7 +177,6 @@ public:
    p[0]= z;
    q[0]= Az;
    qq[0]= zAAz;
-    std::cout<<GridLogIterative<< " GCRnStep p0=z, q0 = A p0 " <<std::endl;
    
    cp =norm2(r);
    LinalgTimer.Stop();
@@ -212,20 +198,16 @@ public:
      cp = axpy_norm(r,-a,q[peri_k],r);
      LinalgTimer.Stop();

-      std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
+      GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 

      if((k==nstep-1)||(cp<rsq)){
 	return cp;
      }


-      std::cout<<GridLogIterative<< " GCRnStep apply preconditioner z= M^-1 r "<< std::endl;
-      std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
      PrecTimer.Start();
      Preconditioner(r,z);// solve Az = r
      PrecTimer.Stop();
-      std::cout<<GridLogIterative<< " --------------------------------------- "<< std::endl;
-      std::cout<<GridLogIterative<< " GCRnStep called Preconditioner z "<< norm2(z) <<std::endl;

      MatTimer.Start();
      Linop.HermOpAndNorm(z,Az,zAz,zAAz);
@@ -405,6 +405,70 @@ namespace Grid {
    }
  };

+  template<class Field> class NonHermitianSchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> 
+  {
+    public:
+      typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+      NonHermitianSchurRedBlackDiagMooeeSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
+          const bool _solnAsInitGuess = false)  
+      : SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
+
+      //////////////////////////////////////////////////////
+      // Override RedBlack specialisation
+      //////////////////////////////////////////////////////
+      virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
+      {
+        GridBase* grid  = _Matrix.RedBlackGrid();
+        GridBase* fgrid = _Matrix.Grid();
+
+        Field  tmp(grid);
+        Field Mtmp(grid);
+
+        pickCheckerboard(Even, src_e, src);
+        pickCheckerboard(Odd , src_o, src);
+
+        /////////////////////////////////////////////////////
+        // src_o = Mdag * (source_o - Moe MeeInv source_e)
+        /////////////////////////////////////////////////////
+        _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even );
+        _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );     
+        src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );     
+      }
+      
+      virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
+      {
+        GridBase* grid  = _Matrix.RedBlackGrid();
+        GridBase* fgrid = _Matrix.Grid();
+
+        Field     tmp(grid);
+        Field   sol_e(grid);
+        Field src_e_i(grid);
+        
+        ///////////////////////////////////////////////////
+        // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+        ///////////////////////////////////////////////////
+        _Matrix.Meooe(sol_o, tmp);         assert(     tmp.Checkerboard() == Even );
+        src_e_i = src_e - tmp;             assert( src_e_i.Checkerboard() == Even );
+        _Matrix.MooeeInv(src_e_i, sol_e);  assert(   sol_e.Checkerboard() == Even );
+       
+        setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
+        setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd  );
+      }
+
+      virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
+      {
+        NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o);  assert(sol_o.Checkerboard() == Odd);
+      }
+
+      virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
+      {
+        NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
+      }
+  };
+
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal is identity, right preconditioned by Mee^inv
  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta
@@ -482,5 +546,76 @@ namespace Grid {
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
    }
  };
+
+  template<class Field> class NonHermitianSchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> 
+  {
+    public:
+      typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+      /////////////////////////////////////////////////////
+      // Wrap the usual normal equations Schur trick
+      /////////////////////////////////////////////////////
+      NonHermitianSchurRedBlackDiagTwoSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
+          const bool _solnAsInitGuess = false)  
+      : SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
+
+      virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
+      {
+        GridBase* grid  = _Matrix.RedBlackGrid();
+        GridBase* fgrid = _Matrix.Grid();
+
+        Field  tmp(grid);
+        Field Mtmp(grid);
+
+        pickCheckerboard(Even, src_e, src);
+        pickCheckerboard(Odd , src_o, src);
+      
+        /////////////////////////////////////////////////////
+        // src_o = Mdag * (source_o - Moe MeeInv source_e)
+        /////////////////////////////////////////////////////
+        _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even );
+        _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );     
+        src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );     
+      }
+
+      virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
+      {
+        GridBase* grid  = _Matrix.RedBlackGrid();
+        GridBase* fgrid = _Matrix.Grid();
+
+        Field sol_o_i(grid);
+        Field     tmp(grid);
+        Field   sol_e(grid);
+
+        ////////////////////////////////////////////////
+        // MooeeInv due to pecond
+        ////////////////////////////////////////////////
+        _Matrix.MooeeInv(sol_o, tmp);
+        sol_o_i = tmp;
+
+        ///////////////////////////////////////////////////
+        // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+        ///////////////////////////////////////////////////
+        _Matrix.Meooe(sol_o_i, tmp);    assert(   tmp.Checkerboard() == Even );
+        tmp = src_e - tmp;              assert( src_e.Checkerboard() == Even );
+        _Matrix.MooeeInv(tmp, sol_e);   assert( sol_e.Checkerboard() == Even );
+       
+        setCheckerboard(sol, sol_e);    assert(   sol_e.Checkerboard() == Even );
+        setCheckerboard(sol, sol_o_i);  assert( sol_o_i.Checkerboard() == Odd  );
+      };
+
+      virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
+      {
+        NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o);
+      };
+
+      virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o,  std::vector<Field>& sol_o)
+      {
+        NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
+      }
+  };
 }
+
 #endif
@@ -6,6 +6,12 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;

+#ifdef GRID_NVCC
+#define SMALL_LIMIT (0)
+#else
+#define SMALL_LIMIT (4096)
+#endif
+
 #ifdef POINTER_CACHE
 int PointerCache::victim;

@@ -13,7 +19,7 @@ PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];

 void *PointerCache::Insert(void *ptr,size_t bytes) {

-  if (bytes < 4096 ) return ptr;
+  if (bytes < SMALL_LIMIT ) return ptr;

 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
@@ -50,7 +56,7 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {

 void *PointerCache::Lookup(size_t bytes) {

-  if (bytes < 4096 ) return NULL;
+  if (bytes < SMALL_LIMIT ) return NULL;

 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
@@ -49,8 +49,13 @@ NAMESPACE_BEGIN(Grid);
 #ifdef POINTER_CACHE
 class PointerCache {
 private:
-
+/*Pinning pages is costly*/
+/*Could maintain separate large and small allocation caches*/
+#ifdef GRID_NVCC 
+  static const int Ncache=128;
+#else
  static const int Ncache=8;
+#endif
  static int victim;

  typedef struct { 
@@ -63,7 +68,6 @@ private:

 public:

-
  static void *Insert(void *ptr,size_t bytes) ;
  static void *Lookup(size_t bytes) ;

@@ -170,13 +174,14 @@ public:
    // Unified (managed) memory
    ////////////////////////////////////
    if ( ptr == (_Tp *) NULL ) {
+      //      printf(" alignedAllocater cache miss %ld bytes ",bytes);      BACKTRACEFP(stdout);
      auto err = cudaMallocManaged((void **)&ptr,bytes);
      if( err != cudaSuccess ) {
 	ptr = (_Tp *) NULL;
 	std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
 	assert(0);
      }
-    }
+    } 
    assert( ptr != (_Tp *)NULL);
 #else 
    //////////////////////////////////////////////////////////////////////////////////////////
@@ -47,20 +47,19 @@ public:
  // Give Lattice access
  template<class object> friend class Lattice;

-  GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {}; 
+  GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;}; 

  GridBase(const Coordinate & processor_grid,
 	   const CartesianCommunicator &parent,
 	   int &split_rank) 
-    : CartesianCommunicator(processor_grid,parent,split_rank) {};
+    : CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;};

  GridBase(const Coordinate & processor_grid,
 	   const CartesianCommunicator &parent) 
-    : CartesianCommunicator(processor_grid,parent,dummy) {};
+    : CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;};

  virtual ~GridBase() = default;

-
  // Physics Grid information.
  Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
  Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
@@ -80,7 +79,8 @@ public:
  Coordinate _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
  Coordinate _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1

-    bool _isCheckerBoarded; 
+  bool _isCheckerBoarded; 
+  int        LocallyPeriodic;

 public:

@@ -173,6 +173,7 @@ public:
  ///////////////////////////////////////////////////
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_object scalar_object;
  typedef vobj vector_object;

 private:
@@ -37,19 +37,18 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
  GridBase *grid = l.Grid();
  int Nsimd = grid->iSites();

-  Coordinate gcoor;
-  ExtractBuffer<scalar_type> mergebuf(Nsimd);
-
-  vector_type vI;
  auto l_v = l.View();
-  for(int o=0;o<grid->oSites();o++){
+  thread_for( o, grid->oSites(), {
+    vector_type vI;
+    Coordinate gcoor;
+    ExtractBuffer<scalar_type> mergebuf(Nsimd);
    for(int i=0;i<grid->iSites();i++){
      grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
      mergebuf[i]=(Integer)gcoor[mu];
    }
    merge<vector_type,scalar_type>(vI,mergebuf);
    l_v[o]=vI;
-  }
+  });
 };

 // LatticeCoordinate();
@@ -156,7 +156,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj,class sobj>
-void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
+accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
        
  GridBase *grid = l.Grid();

@@ -185,7 +185,7 @@ void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
 };

 template<class vobj,class sobj>
-void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
+accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){

  GridBase *grid=l.Grid();

@@ -1,5 +1,4 @@
 /*************************************************************************************
-
    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/lattice/Lattice_transfer.h
@@ -83,12 +82,35 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
  
-
 template<class vobj,class CComplex,int nbasis>
 inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
+			  const             Lattice<vobj>   &fineData,
+			  const std::vector<Lattice<vobj> > &Basis)
+{
+  GridBase * fine  = fineData.Grid();
+  GridBase * coarse= coarseData.Grid();
+
+  Lattice<CComplex> ip(coarse); 
+
+  //  auto fineData_   = fineData.View();
+  auto coarseData_ = coarseData.View();
+  auto ip_         = ip.View();
+  for(int v=0;v<nbasis;v++) {
+    blockInnerProduct(ip,Basis[v],fineData);
+    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+	coalescedWrite(coarseData_[sc](v),ip_(sc));
+      });
+  }
+}
+
+template<class vobj,class CComplex,int nbasis>
+inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 const             Lattice<vobj>   &fineData,
 			 const std::vector<Lattice<vobj> > &Basis)
 {
+  typedef iVector<CComplex,nbasis > coarseSiteData;
+  coarseSiteData elide;
+  typedef decltype(coalescedRead(elide)) ScalarComplex;
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
  int  _ndimension = coarse->_ndimension;
@@ -106,26 +128,40 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
    assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
  }
+  int blockVol = fine->oSites()/coarse->oSites();

  coarseData=Zero();

  auto fineData_   = fineData.View();
  auto coarseData_ = coarseData.View();
-  // Loop over coars parallel, and then loop over fine associated with coarse.
-  thread_for( sf, fine->oSites(), {
-    int sc;
-    Coordinate coor_c(_ndimension);
-    Coordinate coor_f(_ndimension);
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
+  // Otherwise do fine inner product per site, and make the update atomic
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////
+  accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {

-    thread_critical {
-      for(int i=0;i<nbasis;i++) {
-	auto Basis_      = Basis[i].View();
-	coarseData_[sc](i)=coarseData_[sc](i) + innerProduct(Basis_[sf],fineData_[sf]);
-      }
+    auto sc=sci/nbasis;
+    auto i=sci%nbasis;
+    auto Basis_      = Basis[i].View();
+
+    Coordinate coor_c(_ndimension);
+    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
+
+    int sf;
+    decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
+
+    for(int sb=0;sb<blockVol;sb++){
+
+      Coordinate coor_b(_ndimension);
+      Coordinate coor_f(_ndimension);
+
+      Lexicographic::CoorFromIndex(coor_b,sb,block_r);
+      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
+      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
+      
+      reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
    }
+    coalescedWrite(coarseData_[sc](i),reduce);
  });
  return;
 }
@@ -160,7 +196,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
  auto fineY_  = fineY.View();
  auto coarseA_= coarseA.View();

-  thread_for(sf, fine->oSites(), {
+  accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
    
    int sc;
    Coordinate coor_c(_ndimension);
@@ -171,7 +207,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

    // z = A x + y
-    fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf];
+    coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));

  });

@@ -196,7 +232,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,

  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
-  thread_for(ss, coarse->oSites(),{
+  accelerator_for(ss, coarse->oSites(), 1, {
    CoarseInner_[ss] = coarse_inner_[ss];
  });
 }
@@ -226,23 +262,29 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  for(int d=0 ; d<_ndimension;d++){
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
  }
+  int blockVol = fine->oSites()/coarse->oSites();

  // Turn this around to loop threaded over sc and interior loop 
  // over sf would thread better
-  coarseData=Zero();
  auto coarseData_ = coarseData.View();
  auto fineData_   = fineData.View();

-  thread_for(sf,fine->oSites(),{
-    int sc;
+  accelerator_for(sc,coarse->oSites(),1,{
+
+    // One thread per sub block
    Coordinate coor_c(_ndimension);
-    Coordinate coor_f(_ndimension);
-    
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
-    
-    thread_critical { 
+    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
+    coarseData_[sc]=Zero();
+
+    for(int sb=0;sb<blockVol;sb++){
+      
+      int sf;
+      Coordinate coor_b(_ndimension);
+      Coordinate coor_f(_ndimension);
+      Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
+      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
+      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
+
      coarseData_[sc]=coarseData_[sc]+fineData_[sf];
    }

@@ -296,6 +338,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
  }
 }

+#if 0
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -321,7 +364,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  auto coarseData_ = coarseData.View();

  // Loop with a cache friendly loop ordering
-  thread_for(sf,fine->oSites(),{
+  accelerator_for(sf,fine->oSites(),1,{
    int sc;
    Coordinate coor_c(_ndimension);
    Coordinate coor_f(_ndimension);
@@ -332,13 +375,35 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,

    for(int i=0;i<nbasis;i++) {
      auto basis_ = Basis[i].View();
-      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf];
-      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf];
+      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
+      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
    }
  });
  return;
  
 }
+#else
+template<class vobj,class CComplex,int nbasis>
+inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
+			 Lattice<vobj>   &fineData,
+			 const std::vector<Lattice<vobj> > &Basis)
+{
+  GridBase * fine  = fineData.Grid();
+  GridBase * coarse= coarseData.Grid();
+
+  fineData=Zero();
+  for(int i=0;i<nbasis;i++) {
+    Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
+    Lattice<CComplex> cip(coarse);
+    auto cip_ = cip.View();
+    auto  ip_ =  ip.View();
+    accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
+	coalescedWrite(cip_[sc], ip_(sc)());
+    });
+    blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
+  }
+}
+#endif

 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
@@ -374,6 +439,67 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
  });
 }

+template<class vobj>
+void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize)
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+
+  GridBase *Fg = From.Grid();
+  GridBase *Tg = To.Grid();
+  assert(!Fg->_isCheckerBoarded);
+  assert(!Tg->_isCheckerBoarded);
+  int Nsimd = Fg->Nsimd();
+  int nF = Fg->_ndimension;
+  int nT = Tg->_ndimension;
+  int nd = nF;
+  assert(nF == nT);
+
+  for(int d=0;d<nd;d++){
+    assert(Fg->_processors[d]  == Tg->_processors[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  Coordinate ldf = Fg->_ldimensions;
+  Coordinate rdf = Fg->_rdimensions;
+  Coordinate isf = Fg->_istride;
+  Coordinate osf = Fg->_ostride;
+  Coordinate rdt = Tg->_rdimensions;
+  Coordinate ist = Tg->_istride;
+  Coordinate ost = Tg->_ostride;
+  auto t_v = To.View();
+  auto f_v = From.View();
+  accelerator_for(idx,Fg->lSites(),1,{
+    sobj s;
+    Coordinate Fcoor(nd);
+    Coordinate Tcoor(nd);
+    Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
+    int in_region=1;
+    for(int d=0;d<nd;d++){
+      if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){ 
+	in_region=0;
+      }
+      Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
+    }
+    if (in_region) {
+      Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
+      Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
+      Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
+      Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
+      scalar_type * fp = (scalar_type *)&f_v[odx_f];
+      scalar_type * tp = (scalar_type *)&t_v[odx_t];
+      for(int w=0;w<words;w++){
+	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME IF RRII layout, type pun no worke
+      }
+      //      peekLocalSite(s,From,Fcoor);
+      //      pokeLocalSite(s,To  ,Tcoor);
+    }
+  });
+}
+

 template<class vobj>
 void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
@@ -101,7 +101,8 @@ public:
  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

  // Efficient support for multigrid coarsening
-  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

  void   Meooe5D       (const FermionField &in, FermionField &out);
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
@@ -62,14 +62,15 @@ public:

  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

-      ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
-      //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
-      //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+  ///////////////////////////////////////////////////////////////
+  // Physical surface field utilities
+  ///////////////////////////////////////////////////////////////
+  //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
+  //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

  // Constructors
  ContinuedFractionFermion5D(GaugeField &_Umu,
@@ -89,6 +89,7 @@ public:

  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac


      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
@@ -103,6 +103,7 @@ public:
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);

  ///////////////////////////////////////////////////////////////
@@ -86,7 +86,8 @@ public:
  void   MooeeDag    (const FermionField &in, FermionField &out);
  void   MooeeInvDag (const FermionField &in, FermionField &out);

-  void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);

  // These can be overridden by fancy 5d chiral action
@@ -67,12 +67,13 @@ public:

  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

-      ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+  ///////////////////////////////////////////////////////////////
+  // Physical surface field utilities
+  ///////////////////////////////////////////////////////////////
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

  // Constructors
  PartialFractionFermion5D(GaugeField &_Umu,
@@ -115,9 +115,10 @@ public:
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
-  void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
-                   int gamma, int dag);
+  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out);
+  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag);

  ///////////////////////////////////////////////////////////////
  // Extra methods added by derived
@@ -111,15 +111,16 @@ public:
  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void   MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

  // These can be overridden by fancy 5d chiral action
  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-      void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-      void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;

  // Implement hopping term non-hermitian hopping term; half cb or both
  // Implement s-diagonal DW
@@ -131,6 +132,9 @@ public:
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
+  void DhopDirComms(const FermionField &in);
+  void DhopDirCalc(const FermionField &in, FermionField &out,int point);
    
  ///////////////////////////////////////////////////////////////
  // New methods added 
@@ -60,6 +60,9 @@ public:
 			    int Ls, int Nsite, const FermionField &in, FermionField &out,
 			    int interior=1,int exterior=1) ;

+  static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
+			  int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
+
  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);

@@ -100,8 +103,17 @@ public:

 private:

-  static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
+  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
 				   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
+
+  static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
      
  // Specialised variants
  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
@@ -54,6 +54,14 @@ public:
    _Mat.Mdir(in,tmp,dir,disp);
    G5R5(out,tmp);
  }
+  void OpDirAll(const Field &in, std::vector<Field> &out) {
+    Field tmp(in.Grid());
+    _Mat.MdirAll(in,out);
+    for(int p=0;p<out.size();p++) {
+      tmp=out[p];
+      G5R5(out[p],tmp);
+    }
+  }

  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){

@@ -96,6 +104,12 @@ public:
    _Mat.Mdir(in,tmp,dir,disp);
    out=g5*tmp;
  }
+  void OpDirAll(const Field &in, std::vector<Field> &out) {
+    _Mat.MdirAll(in,out);
+    for(int p=0;p<out.size();p++) {
+      out[p]=g5*out[p];
+    }
+  }

  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){

@@ -389,6 +389,14 @@ void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,in
  Meo5D(psi,tmp);
  this->DhopDir(tmp,chi,dir,disp);
 }
+template<class Impl>
+void  CayleyFermion5D<Impl>::MdirAll(const FermionField &psi, std::vector<FermionField> &out)
+{
+  FermionField tmp(psi.Grid());
+  Meo5D(psi,tmp);
+  this->DhopDirAll(tmp,out);
+}
+
 // force terms; five routines; default to Dhop on diagonal
 template<class Impl>
 void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
@@ -143,6 +143,25 @@ void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFi
  }
 }
 template<class Impl>
+void  ContinuedFractionFermion5D<Impl>::MdirAll (const FermionField &psi, std::vector<FermionField> &chi)
+{
+  int Ls = this->Ls;
+
+  this->DhopDirAll(psi,chi); // Dslash on diagonal. g5 Dslash is hermitian
+
+  for(int p=0;p<chi.size();p++){
+    int sign=1;
+    for(int s=0;s<Ls;s++){
+      if ( s==(Ls-1) ){
+	ag5xpby_ssp(chi[p],Beta[s]*ZoloHiInv,chi[p],0.0,chi[p],s,s);
+      } else {
+	ag5xpby_ssp(chi[p],cc[s]*Beta[s]*sign*ZoloHiInv,chi[p],0.0,chi[p],s,s);
+      }
+      sign=-sign; 
+    }
+  }
+}
+template<class Impl>
 void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls = this->Ls;
@@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
 template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  assert(0);
+}
+template <class Impl>
 RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
@@ -362,12 +362,19 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  assert(0); // Not implemented yet
+}

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
+{

  Compressor compressor;
  Stencil.HaloExchange(in, compressor);
@@ -380,6 +387,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
  });
 };

+
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
 						  DoubledGaugeField &U,
@@ -404,7 +412,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 #ifdef GRID_OMP
  Compressor compressor; 
  int len =  U.Grid()->oSites();
-  const int LLs =  1;

  DhopTotalTime   -= usecond();

@@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Impl>
+ template<class Impl>
 void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
  // this does both dag and undag but is trivial; make a common helper routing
  int Ls = this->Ls;
@@ -45,8 +45,25 @@ void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFiel
    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
  }
  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-
 }
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::MdirAll (const FermionField &psi, std::vector<FermionField> &chi){
+  // this does both dag and undag but is trivial; make a common helper routing
+  int Ls = this->Ls;
+
+  this->DhopDirAll(psi,chi);
+
+  for(int point=0;point<chi.size();point++){
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(chi[point],-scale,chi[point],0.0,chi[point],s,s); 
+      ag5xpby_ssp(chi[point], scale,chi[point],0.0,chi[point],s+1,s+1); 
+    }
+    ag5xpby_ssp(chi[point],p[nblock]*scale/amax,chi[point],0.0,chi[point],Ls-1,Ls-1);
+  }
+}
+
 template<class Impl>
 void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
 {
@@ -241,6 +241,15 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);

 };
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
+{
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in,compressor);
+  uint64_t Nsite = Umu.Grid()->oSites();
+  Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out);
+};
+

 template<class Impl>
 void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
@@ -319,28 +319,51 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
 }

 template <class Impl>
-void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
+template <class Impl>
+void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  DhopDirAll(in, out);
+}

 template <class Impl>
 void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in, compressor);
+
  int skip = (disp == 1) ? 0 : 1;
  int dirdisp = dir + skip * 4;
  int gamma = dir + (1 - skip) * 4;

-  DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
+  DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
 };
-
 template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
-  Compressor compressor(dag);
-
+  Compressor compressor(DaggerNo);
  Stencil.HaloExchange(in, compressor);
+
+  assert((out.size()==8)||(out.size()==9)); 
+  for(int dir=0;dir<Nd;dir++){
+    for(int disp=-1;disp<=1;disp+=2){
+
+      int skip = (disp == 1) ? 0 : 1;
+      int dirdisp = dir + skip * 4;
+      int gamma = dir + (1 - skip) * 4;
+
+      DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
+    }
+  }
+}
+template <class Impl>
+void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+{
  int Ls=1;
-  int Nsite=in.oSites();
+  uint64_t Nsite=in.oSites();
  Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma);
 };

@@ -348,7 +371,8 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
-                                       FermionField &out, int dag) {
+                                       FermionField &out, int dag) 
+{
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
@@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  }								\
  synchronise();						

-#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
-  if (gamma == Dir) {						\
+#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)		\
    if (SE->_is_local ) {					\
      int perm= SE->_permute;					\
      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
@@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
    }								\
    synchronise();						\
    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
-    Recon(result, Uchi);					\
-    synchronise();						\
+    Recon(result, Uchi);					
+
+#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
+  if (gamma == Dir) {						\
+    GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon);			\
  }

+
  ////////////////////////////////////////////////////////////////////
  // All legs kernels ; comms then compute
  ////////////////////////////////////////////////////////////////////
@@ -284,7 +287,36 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
  }
 };

-template <class Impl>
+#define DhopDirMacro(Dir,spProj,spRecon)	\
+  template <class Impl>							\
+  void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
+					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
+  {									\
+  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;		\
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;			\
+  calcHalfSpinor chi;							\
+  calcSpinor result;							\
+  calcHalfSpinor Uchi;							\
+  StencilEntry *SE;							\
+  int ptype;								\
+  const int Nsimd = SiteHalfSpinor::Nsimd();				\
+  const int lane=SIMTlane(Nsimd);					\
+									\
+  SE = st.GetEntry(ptype, dir, sF);					\
+  GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);				\
+  coalescedWrite(out[sF], result,lane);					\
+  }									
+
+DhopDirMacro(Xp,spProjXp,spReconXp);
+DhopDirMacro(Yp,spProjYp,spReconYp);
+DhopDirMacro(Zp,spProjZp,spReconZp);
+DhopDirMacro(Tp,spProjTp,spReconTp);
+DhopDirMacro(Xm,spProjXm,spReconXm);
+DhopDirMacro(Ym,spProjYm,spReconYm);
+DhopDirMacro(Zm,spProjZm,spReconZm);
+DhopDirMacro(Tm,spProjTm,spReconTm);
+
+template <class Impl> 
 void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
 				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) 
 {
@@ -299,18 +331,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  const int lane=SIMTlane(Nsimd);

  SE = st.GetEntry(ptype, dir, sF);
-  if (gamma == Xp) {						
-    if (SE->_is_local ) {					
-      int perm= SE->_permute;					
-      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	
-      spProjXp(chi,tmp);						
-    } else {							
-      chi = coalescedRead(buf[SE->_offset],lane);			
-    }								
-    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		
-    spReconXp(result, Uchi);					
-  }
-
+  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
  GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
  GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
  GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
@@ -321,6 +342,38 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
  coalescedWrite(out[sF], result,lane);
 }

+template <class Impl>
+void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
+				      int Nsite, const FermionField &in, std::vector<FermionField> &out) 
+{
+   auto U_v   = U.View();
+   auto in_v  = in.View();
+   auto st_v  = st.View();
+
+   auto out_Xm = out[0].View();
+   auto out_Ym = out[1].View();
+   auto out_Zm = out[2].View();
+   auto out_Tm = out[3].View();
+   auto out_Xp = out[4].View();
+   auto out_Yp = out[5].View();
+   auto out_Zp = out[6].View();
+   auto out_Tp = out[7].View();
+
+   accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
+      int sU=sss/Ls;				
+      int sF =sss;				
+      DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
+      DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
+      DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
+      DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
+      DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
+      DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
+      DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
+      DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
+   });
+}
+
+
 template <class Impl>
 void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 					 int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) 
@@ -332,13 +385,32 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   auto in_v  = in.View();
   auto out_v = out.View();
   auto st_v  = st.View();
-   accelerator_for(ss,Nsite,Simd::Nsimd(),{
-    for(int s=0;s<Ls;s++){
-      int sU=ss;
-      int sF = s+Ls*sU; 
-      DhopDirK(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp,gamma);
-    }
-  });
+#define LoopBody(Dir)				\
+   case Dir :			\
+     accelerator_forNB(ss,Nsite,Simd::Nsimd(),{	\
+       for(int s=0;s<Ls;s++){			\
+	 int sU=ss;				\
+	 int sF = s+Ls*sU;						\
+	 DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
+       }							       \
+       });							       \
+     break;
+
+   switch(gamma){
+   LoopBody(Xp);
+   LoopBody(Yp);
+   LoopBody(Zp);
+   LoopBody(Tp);
+
+   LoopBody(Xm);
+   LoopBody(Ym);
+   LoopBody(Zm);
+   LoopBody(Tm);
+   default:
+     assert(0);
+     break;
+   }
+#undef LoopBody
 } 

 #define KERNEL_CALLNB(A) \
@@ -80,6 +80,8 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,

 static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,   
                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient"); 
+static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,   
+                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB"); 
 static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,   
                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); 

@@ -119,6 +119,17 @@ class ConjugateGradientModule: public SolverModule<ConjugateGradient, Field, Sol
  }
 };

+template <class Field >
+class BiCGSTABModule: public SolverModule<BiCGSTAB, Field, SolverParameters> {
+  typedef SolverModule<BiCGSTAB, Field, SolverParameters> SolverBase;
+  using SolverBase::SolverBase; // for constructors
+
+  // acquire resource
+  virtual void initialize(){
+    this->SolverPtr.reset(new BiCGSTAB<Field>(this->Par_.tolerance, this->Par_.max_iterations, true));
+  }
+};
+
 template <class Field >
 class ConjugateResidualModule: public SolverModule<ConjugateResidual, Field, SolverParameters> {
  typedef SolverModule<ConjugateResidual, Field, SolverParameters> SolverBase;
@@ -92,6 +92,7 @@ public:
  };

  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
+  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}

  void ImportGauge(const GaugeField& _U) {
@@ -97,6 +97,23 @@ namespace Grid {
    template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {};
    template<typename T> struct is_tensor_variable<T, typename std::enable_if<is_tensor<T>::value
        && !is_tensor_fixed<T>::value>::type> : public std::true_type {};
+
+    // Helper functions to get the ultimate scalar inside a tensor, and corresponding size
+    template <typename ET>
+    inline typename std::enable_if<is_tensor<ET>::value, const typename ET::Index>::type
+    getScalarCount(const ET &eigenTensor) { return eigenTensor.size() * Traits<ET>::count; }
+    template <typename ET>
+    inline typename std::enable_if<is_tensor_of_scalar<ET>::value, const typename ET::Scalar *>::type
+    getFirstScalar(const ET &eigenTensor) { return eigenTensor.data(); }
+    template <typename ET>
+    inline typename std::enable_if<is_tensor_of_scalar<ET>::value, typename ET::Scalar *>::type
+    getFirstScalar(ET &eigenTensor) { return eigenTensor.data(); }
+    template <typename ET>
+    inline typename std::enable_if<is_tensor_of_container<ET>::value, const typename Traits<ET>::scalar_type *>::type
+    getFirstScalar(const ET &eigenTensor) { return eigenTensor.data()->begin(); }
+    template <typename ET>
+    inline typename std::enable_if<is_tensor_of_container<ET>::value, typename Traits<ET>::scalar_type *>::type
+    getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); }
  }

  // Abstract writer/reader classes ////////////////////////////////////////////
@@ -128,23 +145,6 @@ namespace Grid {
    typename std::enable_if<EigenIO::is_tensor<ETensor>::value>::type
    write(const std::string &s, const ETensor &output);

-    // Helper functions for Scalar vs Container specialisations
-    template <typename ETensor>
-    inline typename std::enable_if<EigenIO::is_tensor_of_scalar<ETensor>::value,
-    const typename ETensor::Scalar *>::type
-    getFirstScalar(const ETensor &output)
-    {
-      return output.data();
-    }
-    
-    template <typename ETensor>
-    inline typename std::enable_if<EigenIO::is_tensor_of_container<ETensor>::value,
-    const typename EigenIO::Traits<ETensor>::scalar_type *>::type
-    getFirstScalar(const ETensor &output)
-    {
-      return output.data()->begin();
-    }
-    
    template <typename S>
    inline typename std::enable_if<EigenIO::is_scalar<S>::value, void>::type
    copyScalars(S * &pCopy, const S &Source)
@@ -318,12 +318,12 @@ namespace Grid {
      TotalDims[TensorRank + i] = Traits::Dimension(i);

    // If the Tensor isn't in Row-Major order, then we'll need to copy it's data
-    const bool CopyData{NumElements > 1 && ETensor::Layout != Eigen::StorageOptions::RowMajor};
+    const bool CopyData{NumElements > 1 && static_cast<int>( ETensor::Layout ) != static_cast<int>( Eigen::StorageOptions::RowMajor )};
    const Scalar * pWriteBuffer;
    std::vector<Scalar> CopyBuffer;
    const Index TotalNumElements = NumElements * Traits::count;
    if( !CopyData ) {
-      pWriteBuffer = getFirstScalar( output );
+      pWriteBuffer = EigenIO::getFirstScalar( output );
    } else {
      // Regardless of the Eigen::Tensor storage order, the copy will be Row Major
      CopyBuffer.resize( TotalNumElements );
@@ -403,6 +403,10 @@ namespace Optimization {
    accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
      return a/b;
    }
+    accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){
+      return a/b;
+    }
+
    // Danger -- element wise divide fro complex, not complex div. 
    // See Grid_vector_types.h lines around 735, applied after "toReal"
    accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){
@@ -628,6 +628,7 @@ void Grid_debug_handler_init(void)
  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
  sigaction(SIGBUS,&sa,NULL);
+  sigaction(SIGUSR2,&sa,NULL);

  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);