Much faster coarsening

2025-07-16 04:56:53 +01:00 · 2020-01-27 13:43:19 -05:00
parent 114db3b99d
commit 76c823781e
1 changed files with 409 additions and 87 deletions
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -1,3 +1,14 @@
+    // blockZaxpy in blockPromote - 3s, 5%
+    // noncoalesced linalg in Preconditioner ~ 3s 5%
+    // Lanczos tuning or replace 10-20s ~ 25%, open ended
+    // setup tuning   5s  ~  8%
+    //    -- e.g. ordermin, orderstep tunables.
+    // MdagM path without norm in LinOp code.     few seconds
+
+    // Mdir calc blocking kernels
+    // Fuse kernels in blockMaskedInnerProduct
+    // preallocate Vectors in Cayley 5D ~ few percent few seconds
+
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -34,6 +45,34 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+template<class vobj,class CComplex> // vobj: element type of the fine lattices; CComplex: element type of the coarse outputs
+inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner1, // out: block sum of the inner product masked by FineMask1
+				    Lattice<CComplex> &CoarseInner2, // out: block sum of the inner product masked by FineMask2
+				    const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask1, // in: multiplies the local inner product for output 1
+				    const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask2, // in: multiplies the local inner product for output 2
+				    const Lattice<vobj> &fineX, // in: first argument of localInnerProduct
+				    const Lattice<vobj> &fineY) // in: second argument of localInnerProduct
+{
+  typedef decltype(innerProduct(vobj(),vobj())) dotp; // result type of innerProduct on two vobj values
+  // Purpose: form localInnerProduct(fineX,fineY) once, then for each mask multiply by it and blockSum into the matching CoarseInner output.
+  GridBase *coarse(CoarseInner1.Grid()); // NOTE(review): not referenced again in this function
+  GridBase *fine  (fineX.Grid());
+  // fine_inner holds the unmasked local inner product; fine_inner_msk is scratch reused for each masked copy.
+  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
+  Lattice<dotp> fine_inner_msk(fine);
+  // Performance notes kept from the original author:
+  //  - the mask multiply could be fused with the inner product;
+  //  - a single block-sum kernel could handle both masks.
+  fine_inner = localInnerProduct(fineX,fineY);
+  // Mask 1 -> CoarseInner1
+  mult(fine_inner_msk, fine_inner,FineMask1);
+  blockSum(CoarseInner1,fine_inner_msk);
+  // Mask 2 -> CoarseInner2 (overwrites the scratch field used for mask 1)
+  mult(fine_inner_msk, fine_inner,FineMask2);
+  blockSum(CoarseInner2,fine_inner_msk);
+}
+
+
 class Geometry {
 public:
  int npoint;
@@ -51,10 +90,10 @@ public:
    directions.resize(npoint);
    displacements.resize(npoint);
    for(int d=0;d<_d;d++){
-      directions[2*d  ] = d+base;
-      directions[2*d+1] = d+base;
-      displacements[2*d  ] = +1;
-      displacements[2*d+1] = -1;
+      directions[d   ] = d+base;
+      directions[d+_d] = d+base;
+      displacements[d  ] = +1;
+      displacements[d+_d]= -1;
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
@@ -136,20 +175,15 @@ public:
    std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
  }
  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
-    //    std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
    blockProject(CoarseVec,FineVec,subspace);
-    //    std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
  }
  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
    FineVec.Checkerboard() = subspace[0].Checkerboard();
-    //    std::cout << GridLogMessage<< "BlockPromote"<<std::endl;
    blockPromote(CoarseVec,FineVec,subspace);
-    //    std::cout << GridLogMessage<< "BlockPromote done"<<std::endl;
  }
  void CreateSubspaceRandom(GridParallelRNG &RNG){
    for(int i=0;i<nbasis;i++){
      random(RNG,subspace[i]);
-      //      std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
    }
  }

@@ -190,12 +224,15 @@ public:
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
  ////////////////////////////////////////////////////////////////////////////////////////////////
+#if 1
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
 				       double lo,
-				       int order,
-				       int orderstep
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
 				       ) {

    RealD scale;
@@ -215,7 +252,7 @@ public:
    int b =0;
    {
      // Filter
-      Chebyshev<FineField> Cheb(lo,hi,order);
+      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
@@ -227,7 +264,7 @@ public:

    // Generate a full sequence of Chebyshevs
    {
-      lo=0;
+      lo=filterlo;
      noise=Mn;

      FineField T0(FineGrid); T0 = noise;  
@@ -245,7 +282,7 @@ public:
      hermop.HermOp(T0,y);
      T1=y*xscale+noise*mscale;

-      for(int n=2;n<=orderstep*(nn-1);n++){
+      for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
 	
 	hermop.HermOp(*Tn,y);

@@ -253,6 +290,7 @@ public:
 	auto Tn_v = Tn->View();
 	auto Tnp_v = Tnp->View();
 	auto Tnm_v = Tnm->View();
+	const int Nsimd = CComplex::Nsimd();
 	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
@@ -260,12 +298,14 @@ public:

 	// Possible more fine grained control is needed than a linear sweep,
 	// but huge productivity gain if this is simple algorithm and not a tunable
-	if ( (n%orderstep)==0 ) { 
+	int m =1;
+	if ( n>=ordermin ) m=n-ordermin;
+	if ( (m%orderstep)==0 ) { 
 	  Mn=*Tnp;
 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale;
 	  subspace[b] = Mn;
 	  hermop.Op(Mn,tmp); 
-	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
 	  b++;
 	}

@@ -279,6 +319,202 @@ public:
    }
    assert(b==nn);
  }
+#endif
+#if 0
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+    FineField combined(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+#define FILTERb(llo,hhi,oorder)						\
+    {									\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
+      Cheb(hermop,noise,Mn);						\
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
+      subspace[b]   = Mn;						\
+      hermop.Op(Mn,tmp);						\
+      std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									
+
+    //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);
+
+    RealD alpha=-0.8;
+    RealD beta =-0.8;
+#define FILTER(llo,hhi,oorder)						\
+    {									\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
+      /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
+      Cheb(hermop,noise,Mn);						\
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
+      subspace[b]   = Mn;						\
+      hermop.Op(Mn,tmp);						\
+      std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									
+    
+#define FILTERc(llo,hhi,oorder)				\
+    {							\
+      Chebyshev<FineField> Cheb(llo,hhi,oorder);	\
+      Cheb(hermop,noise,combined);			\
+    }									
+
+    double node = 0.000;
+    FILTERb(lo,hi,orderfilter);// 0
+    //    FILTERc(node,hi,51);// 0
+    noise = Mn;
+    int base = 0;
+    int mult = 100;
+    FILTER(node,hi,base+1*mult);
+    FILTER(node,hi,base+2*mult);
+    FILTER(node,hi,base+3*mult);
+    FILTER(node,hi,base+4*mult);
+    FILTER(node,hi,base+5*mult);
+    FILTER(node,hi,base+6*mult);
+    FILTER(node,hi,base+7*mult);
+    FILTER(node,hi,base+8*mult);
+    FILTER(node,hi,base+9*mult);
+    FILTER(node,hi,base+10*mult);
+    FILTER(node,hi,base+11*mult);
+    FILTER(node,hi,base+12*mult);
+    FILTER(node,hi,base+13*mult);
+    FILTER(node,hi,base+14*mult);
+    FILTER(node,hi,base+15*mult);
+    assert(b==nn);
+  }
+#endif
+
+#if 0
+  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+				       int nn,
+				       double hi,
+				       double lo,
+				       int orderfilter,
+				       int ordermin,
+				       int orderstep,
+				       double filterlo
+				       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+    FineField combined(FineGrid);
+
+    // New normalised noise
+    gaussian(RNG,noise);
+    scale = std::pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    // Initial matrix element
+    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+    int b =0;
+    {						
+      Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
+      //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
+      //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
+      //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
+      JacobiPoly(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp);
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; 
+      b++;
+      //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
+      //      subspace[b]   = tmp;      b++;
+      //    }									
+    }									
+
+#define FILTER(lambda)						\
+    {								\
+      hermop.HermOp(subspace[0],tmp);				\
+      tmp = tmp - lambda *subspace[0];				\
+      scale = std::pow(norm2(tmp),-0.5);			\
+      tmp=tmp*scale;							\
+      subspace[b]   = tmp;						\
+      hermop.Op(subspace[b],tmp);					\
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
+      b++;								\
+    }									
+    //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
+    //      subspace[b]   = tmp;      b++;
+    //    }									
+
+    FILTER(2.0e-5);
+    FILTER(2.0e-4);
+    FILTER(4.0e-4);
+    FILTER(8.0e-4);
+    FILTER(8.0e-4);
+
+    FILTER(2.0e-3);
+    FILTER(3.0e-3);
+    FILTER(4.0e-3);
+    FILTER(5.0e-3);
+    FILTER(6.0e-3);
+
+    FILTER(2.5e-3);
+    FILTER(3.5e-3);
+    FILTER(4.5e-3);
+    FILTER(5.5e-3);
+    FILTER(6.5e-3);
+
+    //    FILTER(6.0e-5);//6
+    //    FILTER(7.0e-5);//8
+    //    FILTER(8.0e-5);//9
+    //    FILTER(9.0e-5);//3
+
+    /*
+    //    FILTER(1.0e-4);//10
+    FILTER(2.0e-4);//11
+    //   FILTER(3.0e-4);//12
+    //    FILTER(4.0e-4);//13
+    FILTER(5.0e-4);//14
+
+    FILTER(6.0e-3);//4
+    FILTER(7.0e-4);//1
+    FILTER(8.0e-4);//7
+    FILTER(9.0e-4);//15
+    FILTER(1.0e-3);//2
+
+    FILTER(2.0e-3);//2
+    FILTER(3.0e-3);//2
+    FILTER(4.0e-3);//2
+    FILTER(5.0e-3);//2
+    FILTER(6.0e-3);//2
+
+    FILTER(7.0e-3);//2
+    FILTER(8.0e-3);//2
+    FILTER(1.0e-2);//2
+    */
+    std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
+    assert(b==nn);
+  }
+#endif
+
+
 };

 // Fine Object == (per site) type of fine field
@@ -288,6 +524,7 @@ class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis
 public:
    
  typedef iVector<CComplex,nbasis >           siteVector;
+  typedef Lattice<CComplex >                  CoarseComplexField;
  typedef Lattice<siteVector>                 CoarseVector;
  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
  typedef iMatrix<CComplex,nbasis >  Cobj;
@@ -305,7 +542,6 @@ public:

  std::vector<CoarseMatrix> A;
      
-      
  ///////////////////////
  // Interface
  ///////////////////////
@@ -316,7 +552,6 @@ public:
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());

-
    //    RealD Nin = norm2(in);
    SimpleCompressor<siteVector> compressor;

@@ -333,16 +568,14 @@ public:
    Aview *Aview_p = & AcceleratorViewContainer[0];

    const int Nsimd = CComplex::Nsimd();
-
    typedef decltype(coalescedRead(in_v[0])) calcVector;
    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;

    GridStopWatch ArithmeticTimer;
    int osites=Grid()->oSites();
-    double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
-    double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
+    //    double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
+    //    double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
    double usecs =-usecond();
-
    // assert(geom.npoint==9);

    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
@@ -418,7 +651,37 @@ public:

    auto out_v = out.View();
    auto in_v  = in.View();
+
+    const int Nsimd = CComplex::Nsimd();
+    typedef decltype(coalescedRead(in_v[0])) calcVector;
+    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+
+    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
+      int ss = sss/nbasis;
+      int b  = sss%nbasis;
+      calcComplex res = Zero();
+      calcVector nbr;
+      int ptype;
+      StencilEntry *SE;
+
+      int lane=SIMTlane(Nsimd);
+      SE=Stencil.GetEntry(ptype,point,ss);
+	  
+      if(SE->_is_local) { 
+	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+      } else {
+	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+      }
+      synchronise();
+
+      for(int bb=0;bb<nbasis;bb++) {
+	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+      }
+      coalescedWrite(out_v[ss](b),res,lane);
+    });
+#if 0
    accelerator_for(ss,Grid()->oSites(),1,{
+
      siteVector res = Zero();
      siteVector nbr;
      int ptype;
@@ -433,18 +696,23 @@ public:
      } else {
 	nbr = Stencil.CommBuf()[SE->_offset];
      }
+      synchronise();

      res = res + Aview_p[point][ss]*nbr;
      
      out_v[ss]=res;
    });
-
+#endif
  }
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
  {
    this->MdirComms(in);
    int ndir=geom.npoint-1;
-    assert(out.size()==ndir);
+    if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { 
+      std::cout <<"MdirAll out size "<< out.size()<<std::endl;
+      std::cout <<"MdirAll ndir "<< ndir<<std::endl;
+      assert(0);
+    }
    for(int p=0;p<ndir;p++){
      MdirCalc(in,out[p],p);
    }
@@ -501,23 +769,40 @@ public:
  };

  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & Subspace){
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+  {
+    typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
+    typedef typename Fobj::scalar_type scalar_type;

-    FineField iblock(FineGrid); // contributions from within this block
-    FineField oblock(FineGrid); // contributions from outwith this block
+    FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
+    FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
+
+    std::vector<FineComplexField> masks(geom.npoint,FineGrid);
+    FineComplexField imask(FineGrid); // contributions from within this block
+    FineComplexField omask(FineGrid); // contributions from outwith this block
+
+    FineComplexField evenmask(FineGrid);
+    FineComplexField oddmask(FineGrid); 

    FineField     phi(FineGrid);
    FineField     tmp(FineGrid);
    FineField     zz(FineGrid); zz=Zero();
    FineField    Mphi(FineGrid);
+    FineField    Mphie(FineGrid);
+    FineField    Mphio(FineGrid);
    std::vector<FineField>     Mphi_p(geom.npoint,FineGrid);

    Lattice<iScalar<vInteger> > coor (FineGrid);
+    Lattice<iScalar<vInteger> > bcoor(FineGrid);
+    Lattice<iScalar<vInteger> > bcb  (FineGrid);

    CoarseVector iProj(Grid()); 
    CoarseVector oProj(Grid()); 
-    CoarseScalar InnerProd(Grid()); 
+    CoarseVector SelfProj(Grid()); 
+    CoarseComplexField iZProj(Grid()); 
+    CoarseComplexField oZProj(Grid()); 

+    CoarseScalar InnerProd(Grid()); 

    // Orthogonalise the subblocks over the basis
    blockOrthogonalise(InnerProd,Subspace.subspace);
@@ -525,22 +810,46 @@ public:
    // Compute the matrix elements of linop between this orthonormal
    // set of vectors.
    int self_stencil=-1;
-    for(int p=0;p<geom.npoint;p++){ 
+    for(int p=0;p<geom.npoint;p++)
+    { 
+      int dir   = geom.directions[p];
+      int disp  = geom.displacements[p];
      A[p]=Zero();
      if( geom.displacements[p]==0){
 	self_stencil=p;
      }
+
+      Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+
+      LatticeCoordinate(coor,dir);
+
+      ///////////////////////////////////////////////////////
+      // Work out even and odd block checkerboarding for fast diagonal term
+      ///////////////////////////////////////////////////////
+      if ( disp==1 ) {
+	bcb   = bcb + div(coor,block);
      }
+	
+      if ( disp==0 ) {
+	  masks[p]= Zero();
+      } else if ( disp==1 ) {
+	masks[p] = where(mod(coor,block)==(block-1),one,zero);
+      } else if ( disp==-1 ) {
+	masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
+      }
+    }
+    evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
+    oddmask  = one-evenmask;
+
    assert(self_stencil!=-1);

    for(int i=0;i<nbasis;i++){
+
      phi=Subspace.subspace[i];

-      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir " << std::endl;
+      //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
      linop.OpDirAll(phi,Mphi_p);
-      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDir calculated" << std::endl;
      linop.OpDiag  (phi,Mphi_p[geom.npoint-1]);
-      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i<<" OpDiag calculated" << std::endl;

      for(int p=0;p<geom.npoint;p++){ 

@@ -549,54 +858,66 @@ public:
 	int dir   = geom.directions[p];
 	int disp  = geom.displacements[p];

-	Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
-
-	LatticeCoordinate(coor,dir);
-
+	if ( (disp==-1) || (!hermitian ) ) {

 	  ////////////////////////////////////////////////////////////////////////
 	  // Pick out contributions coming from this cell and neighbour cell
 	  ////////////////////////////////////////////////////////////////////////
-	if ( disp==0 ) {
-	  iblock = Mphi;
-	  oblock = Zero();
-	} else if ( disp==1 ) {
-	  oblock = where(mod(coor,block)==(block-1),Mphi,zz);
-	  iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
-	} else if ( disp==-1 ) {
-	  oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
-	  iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
-	} else {
-	  assert(0);
-	}
+	  omask = masks[p];
+	  imask = one-omask;
 	
-	// Could do local inner products,
-	// and then block pick the IP's.
-	// Ideally write a routine to do two masked block sums at once
-	std::cout << GridLogMessage<< "CoarsenMatrix picked "<<p<< std::endl;
-	Subspace.ProjectToSubspace(iProj,iblock);
-	Subspace.ProjectToSubspace(oProj,oblock);
-	std::cout << GridLogMessage<< "CoarsenMatrix projected"<<p<< std::endl;
+	  for(int j=0;j<nbasis;j++){
 	    
-	// 4x gain possible in this loop. Profile and identify time loss.
-	// i)  Assume Hermiticity, upper diagonal only (2x)
-	// ii) Local inner product, then pick the local inners and sum. (2x)
+	    blockMaskedInnerProduct(iZProj,oZProj,imask,omask,Subspace.subspace[j],Mphi);
 	    
-	auto iProj_v = iProj.View() ;
-	auto oProj_v = oProj.View() ;
+	    auto iZProj_v = iZProj.View() ;
+	    auto oZProj_v = oZProj.View() ;
 	    auto A_p     =  A[p].View();
 	    auto A_self  = A[self_stencil].View();
-	accelerator_for(ss, Grid()->oSites(),1,{
-	  for(int j=0;j<nbasis;j++){
-	    if( disp!= 0 ) {
-	      A_p[ss](j,i) = oProj_v[ss](j);
-	    }
-	    A_self[ss](j,i) =	A_self[ss](j,i) + iProj_v[ss](j);
-	  }
-	});
+
+	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });

 	  }
 	}
+      }
+
+      ///////////////////////////////////////////
+      // Faster alternate self coupling.. use hermiticity to save 2x
+      ///////////////////////////////////////////
+      {
+	mult(tmp,phi,evenmask);  linop.Op(tmp,Mphie);
+	mult(tmp,phi,oddmask );   linop.Op(tmp,Mphio);
+
+	//	tmp = Mphie*evenmask + Mphio*oddmask;
+	{
+	  auto tmp_      = tmp.View();
+	  auto evenmask_ = evenmask.View();
+	  auto oddmask_  =  oddmask.View();
+	  auto Mphie_    =  Mphie.View();
+	  auto Mphio_    =  Mphio.View();
+	  accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ 
+	      coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
+	    });
+	}
+
+	blockProject(SelfProj,tmp,Subspace.subspace);
+
+	auto SelfProj_ = SelfProj.View();
+	auto A_self  = A[self_stencil].View();
+	accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
+	  for(int j=0;j<nbasis;j++){
+	    coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
+	  }
+	});
+      }
+    }
+    if(hermitian) {
+      std::cout << GridLogMessage << " ForceHermitian "<<std::endl;
+      ForceHermitian();
+    }
+      // AssertHermitian();
+      // ForceDiagonal();
+  }

 #if 0
    ///////////////////////////
@@ -619,25 +940,26 @@ public:
    std::cout<<GridLogMessage<< iProj <<std::endl;
    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 #endif
-    /*
-    if(hermitian) {
-      std::cout << GridLogMessage << " ForceHermitian "<<std::endl;
-      ForceHermitian();
-    }
-    for(int p=0;p<geom.npoint;p++){
-      std::cout << GridLogMessage<< " dir "<< norm2(A[p]) <<std::endl;
-    }
-    */
-      // AssertHermitian();
-      // ForceDiagonal();
-  }
+

  void ForceHermitian(void) {
-    for(int d=0;d<4;d++){
-      int dd=d+1;
-      A[2*d] = adj(Cshift(A[2*d+1],dd,1));
+    CoarseMatrix Diff  (Grid());
+    for(int p=0;p<geom.npoint;p++){
+      int dir   = geom.directions[p];
+      int disp  = geom.displacements[p];
+      if(disp==-1) {
+	// Find the opposite link
+	for(int pp=0;pp<geom.npoint;pp++){
+	  int dirp   = geom.directions[pp];
+	  int dispp  = geom.displacements[pp];
+	  if ( (dirp==dir) && (dispp==1) ){
+	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp]; 
+	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
+	    A[pp] = adj(Cshift(A[p],dir,1));
+	  }
+	}
+      }
    }
-    //      A[8] = 0.5*(A[8] + adj(A[8]));
  }
  void AssertHermitian(void) {
    CoarseMatrix AA    (Grid());