Change to interface to minise comms in evaluating coarse space operator

2025-12-17 11:14:40 +00:00 · 2020-01-06 11:43:59 -05:00
parent 3c3d6a94f3
commit e583035614
20 changed files with 262 additions and 92 deletions
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -35,14 +35,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);

 class Geometry {
-  //    int dimension;
 public:
  int npoint;
  std::vector<int> directions   ;
  std::vector<int> displacements;

  Geometry(int _d)  {
-  
+    
    int base = (_d==5) ? 1:0;

    // make coarse grid stencil for 4d , not 5d
@@ -187,9 +186,10 @@ public:
    }
  }

-  // 
-  // World of possibilities here. 
-  //
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
+  // and this is the best I found
+  ////////////////////////////////////////////////////////////////////////////////////////////////
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
@@ -249,11 +249,18 @@ public:
 	
 	hermop.HermOp(*Tn,y);

-	y=xscale*y+mscale*(*Tn);
-	
-	*Tnp=2.0*y-(*Tnm);
+	auto y_v = y.View();
+	auto Tn_v = Tn->View();
+	auto Tnp_v = Tnp->View();
+	auto Tnm_v = Tnm->View();
+	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+        });

-	if ( (n%orderstep)==0 ) {
+	// Possible more fine grained control is needed than a linear sweep,
+	// but huge productivity gain if this is simple algorithm and not a tunable
+	if ( (n%orderstep)==0 ) { 
 	  Mn=*Tnp;
 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
 	  subspace[b] = Mn;
@@ -270,6 +277,7 @@ public:
 	  
      }
    }
+    assert(b==nn);
  }
 };

@@ -393,42 +401,15 @@ public:
      return norm2(out);
    }
  };
-
-  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
-
-    conformable(_grid,in.Grid());
-    conformable(in.Grid(),out.Grid());
-    
+  void MdirComms(const CoarseVector &in)
+  {
    SimpleCompressor<siteVector> compressor;
-
    Stencil.HaloExchange(in,compressor);
-
-    int ndim = in.Grid()->Nd();
-
-    //////////////
-    // 4D action like wilson
-    // 0+ => 0 
-    // 0- => 1
-    // 1+ => 2 
-    // 1- => 3
-    // etc..
-    //////////////
-    // 5D action like DWF
-    // 1+ => 0 
-    // 1- => 1
-    // 2+ => 2 
-    // 2- => 3
-    // etc..
-
-    auto point = [dir, disp, ndim](){
-      if(dir == 0 and disp == 0)
-	return 8;
-      else if ( ndim==4 ) { 
-	return (4 * dir + 1 - disp) / 2;
-      } else { 
-	return (4 * (dir-1) + 1 - disp) / 2;
-      }
-    }();
+  }
+  void MdirCalc(const CoarseVector &in, CoarseVector &out, int point)
+  {
+    conformable(_grid,in.Grid());
+    conformable(_grid,out.Grid());

    typedef LatticeView<Cobj> Aview;
    Vector<Aview> AcceleratorViewContainer;
@@ -458,10 +439,54 @@ public:
      out_v[ss]=res;
    });

+  }
+  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
+  {
+    this->MdirComms(in);
+    int ndir=geom.npoint-1;
+    assert(out.size()==ndir);
+    for(int p=0;p<ndir;p++){
+      MdirCalc(in,out[p],p);
+    }
+  };
+  void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
+
+    this->MdirComms(in);
+
+    int ndim = in.Grid()->Nd();
+
+    //////////////
+    // 4D action like wilson
+    // 0+ => 0 
+    // 0- => 1
+    // 1+ => 2 
+    // 1- => 3
+    // etc..
+    //////////////
+    // 5D action like DWF
+    // 1+ => 0 
+    // 1- => 1
+    // 2+ => 2 
+    // 2- => 3
+    // etc..
+    auto point = [dir, disp, ndim](){
+      if(dir == 0 and disp == 0)
+	return 8;
+      else if ( ndim==4 ) { 
+	return (4 * dir + 1 - disp) / 2;
+      } else { 
+	return (4 * (dir-1) + 1 - disp) / 2;
+      }
+    }();
+
+    MdirCalc(in,out,point);
+
  };

-  void Mdiag(const CoarseVector &in, CoarseVector &out){
-    Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
+  void Mdiag(const CoarseVector &in, CoarseVector &out)
+  {
+    int point=geom.npoint-1;
+    MdirCalc(in, out, point); // No comms
  };

  
@@ -511,16 +536,12 @@ public:
    for(int i=0;i<nbasis;i++){
      phi=Subspace.subspace[i];

-      for(int p=0;p<geom.npoint;p++){ 
+      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir " << std::endl;
+      linop.OpDirAll(phi,Mphi_p);
+      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir calculated" << std::endl;
+      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);
+      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDiag calculated" << std::endl;

-	int dir   = geom.directions[p];
-	int disp  = geom.displacements[p];
-
-	std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" stencil point "<<p << std::endl;
-	if ( disp==0 ) linop.OpDiag(phi,Mphi_p[p]);
-	else           linop.OpDir (phi,Mphi_p[p],dir,disp); 
-	std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" applied" << std::endl;
-      }
      for(int p=0;p<geom.npoint;p++){ 

 	Mphi = Mphi_p[p];
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -47,6 +47,7 @@ public:
  // Support for coarsening to a multigrid
  virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
  virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
+  virtual void OpDirAll  (const Field &in, std::vector<Field> &out) = 0; // Abstract base

  virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
@@ -83,6 +84,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -116,6 +120,9 @@ public:
    _Mat.Mdir(in,out,dir,disp);
    assert(0);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
    assert(0);
@@ -154,6 +161,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -183,6 +193,9 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
  }
@@ -234,6 +247,9 @@ public:
      void OpDir  (const Field &in, Field &out,int dir,int disp) {
 	assert(0);
      }
+      void OpDirAll  (const Field &in, std::vector<Field> &out){
+	assert(0);
+      };
    };
    template<class Matrix,class Field>
    class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -47,6 +47,7 @@ public:
  }
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
 };

 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -56,12 +57,12 @@ template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrix
 public:
  virtual GridBase *RedBlackGrid(void)=0;

-      //////////////////////////////////////////////////////////////////////
-      // Query the even even properties to make algorithmic decisions
-      //////////////////////////////////////////////////////////////////////
-      virtual RealD  Mass(void)        { return 0.0; };
-      virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
-      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
+  //////////////////////////////////////////////////////////////////////
+  // Query the even even properties to make algorithmic decisions
+  //////////////////////////////////////////////////////////////////////
+  virtual RealD  Mass(void)        { return 0.0; };
+  virtual int    ConstEE(void)     { return 1; }; // Disable assumptions unless overridden
+  virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better

  // half checkerboard operaions
  virtual  void Meooe    (const Field &in, Field &out)=0;
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -101,7 +101,8 @@ public:
  virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

  // Efficient support for multigrid coarsening
-  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

  void   Meooe5D       (const FermionField &in, FermionField &out);
  void   MeooeDag5D    (const FermionField &in, FermionField &out);
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -62,14 +62,15 @@ public:

  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

-      ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
-      //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
-      //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+  ///////////////////////////////////////////////////////////////
+  // Physical surface field utilities
+  ///////////////////////////////////////////////////////////////
+  //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
+  //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

  // Constructors
  ContinuedFractionFermion5D(GaugeField &_Umu,
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -89,6 +89,7 @@ public:

  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac


      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -103,6 +103,7 @@ public:
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);

  ///////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -86,7 +86,8 @@ public:
  void   MooeeDag    (const FermionField &in, FermionField &out);
  void   MooeeInvDag (const FermionField &in, FermionField &out);

-  void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);

  // These can be overridden by fancy 5d chiral action
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -67,12 +67,13 @@ public:

  // Efficient support for multigrid coarsening
  virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
+  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out);

-      ///////////////////////////////////////////////////////////////
-      // Physical surface field utilities
-      ///////////////////////////////////////////////////////////////
-      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
-      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+  ///////////////////////////////////////////////////////////////
+  // Physical surface field utilities
+  ///////////////////////////////////////////////////////////////
+  virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+  virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);

  // Constructors
  PartialFractionFermion5D(GaugeField &_Umu,
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -115,9 +115,10 @@ public:
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
-  void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
-                   int gamma, int dag);
+  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out);
+  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag);

  ///////////////////////////////////////////////////////////////
  // Extra methods added by derived
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -111,15 +111,16 @@ public:
  virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void   MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

  // These can be overridden by fancy 5d chiral action
  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);

-      void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-      void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-      void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;

  // Implement hopping term non-hermitian hopping term; half cb or both
  // Implement s-diagonal DW
@@ -131,6 +132,9 @@ public:
  // add a DhopComm
  // -- suboptimal interface will presently trigger multiple comms.
  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
+  void DhopDirComms(const FermionField &in);
+  void DhopDirCalc(const FermionField &in, FermionField &out,int point);
    
  ///////////////////////////////////////////////////////////////
  // New methods added 
--- a/Grid/qcd/action/fermion/g5HermitianLinop.h
+++ b/Grid/qcd/action/fermion/g5HermitianLinop.h
@@ -54,6 +54,14 @@ public:
    _Mat.Mdir(in,tmp,dir,disp);
    G5R5(out,tmp);
  }
+  void OpDirAll(const Field &in, std::vector<Field> &out) {
+    Field tmp(in.Grid());
+    _Mat.MdirAll(in,out);
+    for(int p=0;p<out.size();p++) {
+      tmp=out[p];
+      G5R5(out[p],tmp);
+    }
+  }

  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){

@@ -96,6 +104,12 @@ public:
    _Mat.Mdir(in,tmp,dir,disp);
    out=g5*tmp;
  }
+  void OpDirAll(const Field &in, std::vector<Field> &out) {
+    _Mat.MdirAll(in,out);
+    for(int p=0;p<out.size();p++) {
+      out[p]=g5*out[p];
+    }
+  }

  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){

--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -389,6 +389,14 @@ void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,in
  Meo5D(psi,tmp);
  this->DhopDir(tmp,chi,dir,disp);
 }
+template<class Impl>
+void  CayleyFermion5D<Impl>::MdirAll(const FermionField &psi, std::vector<FermionField> &out)
+{
+  FermionField tmp(psi.Grid());
+  Meo5D(psi,tmp);
+  this->DhopDirAll(tmp,out);
+}
+
 // force terms; five routines; default to Dhop on diagonal
 template<class Impl>
 void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -143,6 +143,25 @@ void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFi
  }
 }
 template<class Impl>
+void  ContinuedFractionFermion5D<Impl>::MdirAll (const FermionField &psi, std::vector<FermionField> &chi)
+{
+  int Ls = this->Ls;
+
+  this->DhopDirAll(psi,chi); // Dslash on diagonal. g5 Dslash is hermitian
+
+  for(int p=0;p<chi.size();p++){
+    int sign=1;
+    for(int s=0;s<Ls;s++){
+      if ( s==(Ls-1) ){
+	ag5xpby_ssp(chi[p],Beta[s]*ZoloHiInv,chi[p],0.0,chi[p],s,s);
+      } else {
+	ag5xpby_ssp(chi[p],cc[s]*Beta[s]*sign*ZoloHiInv,chi[p],0.0,chi[p],s,s);
+      }
+      sign=-sign; 
+    }
+  }
+}
+template<class Impl>
 void   ContinuedFractionFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls = this->Ls;
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
 template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  assert(0);
+}
+template <class Impl>
 RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
  out.Checkerboard() = in.Checkerboard();
  Dhop(in, out, DaggerNo);
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -362,12 +362,19 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  assert(0); // Not implemented yet
+}

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
+void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
+{

  Compressor compressor;
  Stencil.HaloExchange(in, compressor);
@@ -380,6 +387,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
  });
 };

+
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
 						  DoubledGaugeField &U,
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Impl>
+ template<class Impl>
 void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
  // this does both dag and undag but is trivial; make a common helper routing
  int Ls = this->Ls;
@@ -45,8 +45,25 @@ void  PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionFiel
    ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
  }
  ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
-
 }
+template<class Impl>
+void  PartialFractionFermion5D<Impl>::MdirAll (const FermionField &psi, std::vector<FermionField> &chi){
+  // this does both dag and undag but is trivial; make a common helper routing
+  int Ls = this->Ls;
+
+  this->DhopDirAll(psi,chi);
+
+  for(int point=0;point<chi.size();point++){
+    int nblock=(Ls-1)/2;
+    for(int b=0;b<nblock;b++){
+      int s = 2*b;
+      ag5xpby_ssp(chi[point],-scale,chi[point],0.0,chi[point],s,s); 
+      ag5xpby_ssp(chi[point], scale,chi[point],0.0,chi[point],s+1,s+1); 
+    }
+    ag5xpby_ssp(chi[point],p[nblock]*scale/amax,chi[point],0.0,chi[point],Ls-1,Ls-1);
+  }
+}
+
 template<class Impl>
 void   PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
 {
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -241,6 +241,30 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);

 };
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
+{
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in,compressor);
+
+  assert( (out.size()==8)||(out.size()==9));
+  for(int dir5=1;dir5<=4;dir5++) {
+    for(int disp=-1;disp<=1;disp+=2){
+      int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
+                    // we drop off the innermost fifth dimension
+      assert( (disp==1)||(disp==-1) );
+      assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
+      
+      int skip = (disp==1) ? 0 : 1;
+      int dirdisp = dir+skip*4;
+      int gamma   = dir+(1-skip)*4;
+
+      uint64_t Nsite = Umu.Grid()->oSites();
+      Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out[dirdisp],dirdisp,gamma);
+    }
+  }
+};
+

 template<class Impl>
 void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -319,28 +319,51 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
 }

 template <class Impl>
-void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
+void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
+{
  DhopDir(in, out, dir, disp);
 }
+template <class Impl>
+void WilsonFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
+{
+  DhopDirAll(in, out);
+}

 template <class Impl>
 void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 {
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in, compressor);
+
  int skip = (disp == 1) ? 0 : 1;
  int dirdisp = dir + skip * 4;
  int gamma = dir + (1 - skip) * 4;

-  DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
+  DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
 };
-
 template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
-  Compressor compressor(dag);
-
+  Compressor compressor(DaggerNo);
  Stencil.HaloExchange(in, compressor);
+
+  assert((out.size()==8)||(out.size()==9)); 
+  for(int dir=0;dir<Nd;dir++){
+    for(int disp=-1;disp<=1;disp+=2){
+
+      int skip = (disp == 1) ? 0 : 1;
+      int dirdisp = dir + skip * 4;
+      int gamma = dir + (1 - skip) * 4;
+
+      DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
+    }
+  }
+}
+template <class Impl>
+void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+{
  int Ls=1;
-  int Nsite=in.oSites();
+  uint64_t Nsite=in.oSites();
  Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma);
 };

@@ -348,7 +371,8 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
-                                       FermionField &out, int dag) {
+                                       FermionField &out, int dag) 
+{
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@@ -92,6 +92,7 @@ public:
  };

  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
+  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}

  void ImportGauge(const GaugeField& _U) {