diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index ba71faf8..01b0da4d 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -1,3 +1,14 @@ + // blockZaxpy in bockPromote - 3s, 5% + // noncoalesced linalg in Preconditionoer ~ 3s 5% + // Lancos tuning or replace 10-20s ~ 25%, open ended + // setup tuning 5s ~ 8% + // -- e.g. ordermin, orderstep tunables. + // MdagM path without norm in LinOp code. few seconds + + // Mdir calc blocking kernels + // Fuse kernels in blockMaskedInnerProduct + // preallocate Vectors in Cayley 5D ~ few percent few seconds + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -34,6 +45,34 @@ Author: paboyle NAMESPACE_BEGIN(Grid); +template +inline void blockMaskedInnerProduct(Lattice &CoarseInner1, + Lattice &CoarseInner2, + const Lattice &FineMask1, + const Lattice &FineMask2, + const Lattice &fineX, + const Lattice &fineY) +{ + typedef decltype(innerProduct(vobj(),vobj())) dotp; + + GridBase *coarse(CoarseInner1.Grid()); + GridBase *fine (fineX.Grid()); + + Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); + Lattice fine_inner_msk(fine); + + // Multiply could be fused with innerProduct + // Single block sum kernel could do both masks. + fine_inner = localInnerProduct(fineX,fineY); + + mult(fine_inner_msk, fine_inner,FineMask1); + blockSum(CoarseInner1,fine_inner_msk); + + mult(fine_inner_msk, fine_inner,FineMask2); + blockSum(CoarseInner2,fine_inner_msk); +} + + class Geometry { public: int npoint; @@ -51,10 +90,10 @@ public: directions.resize(npoint); displacements.resize(npoint); for(int d=0;d<_d;d++){ - directions[2*d ] = d+base; - directions[2*d+1] = d+base; - displacements[2*d ] = +1; - displacements[2*d+1] = -1; + directions[d ] = d+base; + directions[d+_d] = d+base; + displacements[d ] = +1; + displacements[d+_d]= -1; } directions [2*_d]=0; displacements[2*_d]=0; @@ -136,20 +175,15 @@ public: std::cout< &hermop, int nn, double hi, double lo, - int order, - int orderstep + int orderfilter, + int ordermin, + int orderstep, + double filterlo ) { RealD scale; @@ -215,7 +252,7 @@ public: int b =0; { // Filter - Chebyshev Cheb(lo,hi,order); + Chebyshev Cheb(lo,hi,orderfilter); Cheb(hermop,noise,Mn); // normalise scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; @@ -227,7 +264,7 @@ public: // Generate a full sequence of Chebyshevs { - lo=0; + lo=filterlo; noise=Mn; FineField T0(FineGrid); T0 = noise; @@ -245,7 +282,7 @@ public: hermop.HermOp(T0,y); T1=y*xscale+noise*mscale; - for(int n=2;n<=orderstep*(nn-1);n++){ + for(int n=2;n<=ordermin+orderstep*(nn-2);n++){ hermop.HermOp(*Tn,y); @@ -253,6 +290,7 @@ public: auto Tn_v = Tn->View(); auto Tnp_v = Tnp->View(); auto Tnm_v = Tnm->View(); + const int Nsimd = CComplex::Nsimd(); accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); @@ -260,12 +298,14 @@ public: // Possible more fine grained control is needed than a linear sweep, // but huge productivity gain if this is simple algorithm and not a tunable - if ( (n%orderstep)==0 ) { + int m =1; + if ( n>=ordermin ) m=n-ordermin; + if ( (m%orderstep)==0 ) { Mn=*Tnp; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; subspace[b] = Mn; hermop.Op(Mn,tmp); - std::cout< "< "< &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + // Initial matrix element + hermop.Op(noise,Mn); std::cout< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(0.002,60.0,1500,-0.5,3.5); \ + + RealD alpha=-0.8; + RealD beta =-0.8; +#define FILTER(llo,hhi,oorder) \ + { \ + Chebyshev Cheb(llo,hhi,oorder); \ + /* JacobiPolynomial Cheb(0.0,60.0,oorder,alpha,beta);*/\ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,combined); \ + } + + double node = 0.000; + FILTERb(lo,hi,orderfilter);// 0 + // FILTERc(node,hi,51);// 0 + noise = Mn; + int base = 0; + int mult = 100; + FILTER(node,hi,base+1*mult); + FILTER(node,hi,base+2*mult); + FILTER(node,hi,base+3*mult); + FILTER(node,hi,base+4*mult); + FILTER(node,hi,base+5*mult); + FILTER(node,hi,base+6*mult); + FILTER(node,hi,base+7*mult); + FILTER(node,hi,base+8*mult); + FILTER(node,hi,base+9*mult); + FILTER(node,hi,base+10*mult); + FILTER(node,hi,base+11*mult); + FILTER(node,hi,base+12*mult); + FILTER(node,hi,base+13*mult); + FILTER(node,hi,base+14*mult); + FILTER(node,hi,base+15*mult); + assert(b==nn); + } +#endif + +#if 0 + virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + // Initial matrix element + hermop.Op(noise,Mn); std::cout< "< JacobiPoly(0.005,60.,1500); + // JacobiPolynomial JacobiPoly(0.002,60.0,1500,-0.5,3.5); + //JacobiPolynomial JacobiPoly(0.03,60.0,500,-0.5,3.5); + // JacobiPolynomial JacobiPoly(0.00,60.0,1000,-0.5,3.5); + JacobiPoly(hermop,noise,Mn); + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< "< class CoarsenedMatrix : public SparseMatrixBase > > { public: - typedef iVector siteVector; + typedef iVector siteVector; + typedef Lattice CoarseComplexField; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; typedef iMatrix Cobj; @@ -304,7 +541,6 @@ public: CartesianStencil Stencil; std::vector A; - /////////////////////// // Interface @@ -316,7 +552,6 @@ public: conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - // RealD Nin = norm2(in); SimpleCompressor compressor; @@ -333,16 +568,14 @@ public: Aview *Aview_p = & AcceleratorViewContainer[0]; const int Nsimd = CComplex::Nsimd(); - typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0](0))) calcComplex; GridStopWatch ArithmeticTimer; int osites=Grid()->oSites(); - double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; - double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); + // double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; + // double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); double usecs =-usecond(); - // assert(geom.npoint==9); accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { @@ -418,7 +651,37 @@ public: auto out_v = out.View(); auto in_v = in.View(); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + int lane=SIMTlane(Nsimd); + SE=Stencil.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + } else { + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + } + synchronise(); + + for(int bb=0;bboSites(),1,{ + siteVector res = Zero(); siteVector nbr; int ptype; @@ -433,18 +696,23 @@ public: } else { nbr = Stencil.CommBuf()[SE->_offset]; } + synchronise(); res = res + Aview_p[point][ss]*nbr; out_v[ss]=res; }); - +#endif } void MdirAll(const CoarseVector &in,std::vector &out) { this->MdirComms(in); int ndir=geom.npoint-1; - assert(out.size()==ndir); + if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { + std::cout <<"MdirAll out size "<< out.size()< > &linop, - Aggregation & Subspace){ + Aggregation & Subspace) + { + typedef Lattice FineComplexField; + typedef typename Fobj::scalar_type scalar_type; - FineField iblock(FineGrid); // contributions from within this block - FineField oblock(FineGrid); // contributions from outwith this block + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); + FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); + + std::vector masks(geom.npoint,FineGrid); + FineComplexField imask(FineGrid); // contributions from within this block + FineComplexField omask(FineGrid); // contributions from outwith this block + + FineComplexField evenmask(FineGrid); + FineComplexField oddmask(FineGrid); FineField phi(FineGrid); FineField tmp(FineGrid); FineField zz(FineGrid); zz=Zero(); FineField Mphi(FineGrid); + FineField Mphie(FineGrid); + FineField Mphio(FineGrid); std::vector Mphi_p(geom.npoint,FineGrid); - Lattice > coor(FineGrid); + Lattice > coor (FineGrid); + Lattice > bcoor(FineGrid); + Lattice > bcb (FineGrid); CoarseVector iProj(Grid()); CoarseVector oProj(Grid()); - CoarseScalar InnerProd(Grid()); + CoarseVector SelfProj(Grid()); + CoarseComplexField iZProj(Grid()); + CoarseComplexField oZProj(Grid()); + CoarseScalar InnerProd(Grid()); // Orthogonalise the subblocks over the basis blockOrthogonalise(InnerProd,Subspace.subspace); @@ -525,22 +810,46 @@ public: // Compute the matrix elements of linop between this orthonormal // set of vectors. int self_stencil=-1; - for(int p=0;p_rdimensions[dir])/(Grid()->_rdimensions[dir]); + + LatticeCoordinate(coor,dir); + + /////////////////////////////////////////////////////// + // Work out even and odd block checkerboarding for fast diagonal term + /////////////////////////////////////////////////////// + if ( disp==1 ) { + bcb = bcb + div(coor,block); + } + + if ( disp==0 ) { + masks[p]= Zero(); + } else if ( disp==1 ) { + masks[p] = where(mod(coor,block)==(block-1),one,zero); + } else if ( disp==-1 ) { + masks[p] = where(mod(coor,block)==(Integer)0,one,zero); + } } + evenmask = where(mod(bcb,2)==(Integer)0,one,zero); + oddmask = one-evenmask; + assert(self_stencil!=-1); for(int i=0;i_rdimensions[dir])/(Grid()->_rdimensions[dir]); + if ( (disp==-1) || (!hermitian ) ) { - LatticeCoordinate(coor,dir); + //////////////////////////////////////////////////////////////////////// + // Pick out contributions coming from this cell and neighbour cell + //////////////////////////////////////////////////////////////////////// + omask = masks[p]; + imask = one-omask; + + for(int j=0;joSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); - //////////////////////////////////////////////////////////////////////// - // Pick out contributions coming from this cell and neighbour cell - //////////////////////////////////////////////////////////////////////// - if ( disp==0 ) { - iblock = Mphi; - oblock = Zero(); - } else if ( disp==1 ) { - oblock = where(mod(coor,block)==(block-1),Mphi,zz); - iblock = where(mod(coor,block)!=(block-1),Mphi,zz); - } else if ( disp==-1 ) { - oblock = where(mod(coor,block)==(Integer)0,Mphi,zz); - iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz); - } else { - assert(0); + } + } + } + + /////////////////////////////////////////// + // Faster alternate self coupling.. use hermiticity to save 2x + /////////////////////////////////////////// + { + mult(tmp,phi,evenmask); linop.Op(tmp,Mphie); + mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); + + // tmp = Mphie*evenmask + Mphio*oddmask; + { + auto tmp_ = tmp.View(); + auto evenmask_ = evenmask.View(); + auto oddmask_ = oddmask.View(); + auto Mphie_ = Mphie.View(); + auto Mphio_ = Mphio.View(); + accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ + coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); + }); } - // Could do local inner products, - // and then block pick the IP's. - // Ideally write a routine to do two masked block sums at once - std::cout << GridLogMessage<< "CoarsenMatrix picked "<oSites(),1,{ + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ for(int j=0;j