diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index f1ac1c81..97ab4dc1 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -35,6 +35,7 @@ Author: Peter Boyle #include #include +#include #include #include #include diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 913f5c0c..01b0da4d 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -1,3 +1,14 @@ + // blockZaxpy in bockPromote - 3s, 5% + // noncoalesced linalg in Preconditionoer ~ 3s 5% + // Lancos tuning or replace 10-20s ~ 25%, open ended + // setup tuning 5s ~ 8% + // -- e.g. ordermin, orderstep tunables. + // MdagM path without norm in LinOp code. few seconds + + // Mdir calc blocking kernels + // Fuse kernels in blockMaskedInnerProduct + // preallocate Vectors in Cayley 5D ~ few percent few seconds + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -34,15 +45,42 @@ Author: paboyle NAMESPACE_BEGIN(Grid); +template +inline void blockMaskedInnerProduct(Lattice &CoarseInner1, + Lattice &CoarseInner2, + const Lattice &FineMask1, + const Lattice &FineMask2, + const Lattice &fineX, + const Lattice &fineY) +{ + typedef decltype(innerProduct(vobj(),vobj())) dotp; + + GridBase *coarse(CoarseInner1.Grid()); + GridBase *fine (fineX.Grid()); + + Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); + Lattice fine_inner_msk(fine); + + // Multiply could be fused with innerProduct + // Single block sum kernel could do both masks. + fine_inner = localInnerProduct(fineX,fineY); + + mult(fine_inner_msk, fine_inner,FineMask1); + blockSum(CoarseInner1,fine_inner_msk); + + mult(fine_inner_msk, fine_inner,FineMask2); + blockSum(CoarseInner2,fine_inner_msk); +} + + class Geometry { - // int dimension; public: int npoint; std::vector directions ; std::vector displacements; Geometry(int _d) { - + int base = (_d==5) ? 1:0; // make coarse grid stencil for 4d , not 5d @@ -52,10 +90,10 @@ public: directions.resize(npoint); displacements.resize(npoint); for(int d=0;d<_d;d++){ - directions[2*d ] = d+base; - directions[2*d+1] = d+base; - displacements[2*d ] = +1; - displacements[2*d+1] = -1; + directions[d ] = d+base; + directions[d+_d] = d+base; + displacements[d ] = +1; + displacements[d+_d]= -1; } directions [2*_d]=0; displacements[2*_d]=0; @@ -63,7 +101,7 @@ public: //// report back std::cout<oSites(),{ + accelerator_for(ss, CoarseGrid->oSites(),1,{ eProj[ss](i)=CComplex(1.0); }); eProj=eProj - iProj; @@ -146,61 +184,9 @@ public: void CreateSubspaceRandom(GridParallelRNG &RNG){ for(int i=0;i &hermop,int nn=nbasis) - { - // Run a Lanczos with sloppy convergence - const int Nstop = nn; - const int Nk = nn+20; - const int Np = nn+20; - const int Nm = Nk+Np; - const int MaxIt= 10000; - RealD resid = 1.0e-3; - - Chebyshev Cheb(0.5,64.0,21); - ImplicitlyRestartedLanczos IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt); - // IRL.lock = 1; - - FineField noise(FineGrid); gaussian(RNG,noise); - FineField tmp(FineGrid); - std::vector eval(Nm); - std::vector evec(Nm,FineGrid); - - int Nconv; - IRL.calc(eval,evec, - noise, - Nconv); - - // pull back nn vectors - for(int b=0;b "< "< Cheb(lo,hi,orderfilter); + Cheb(hermop,noise,Mn); + // normalise + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "<View(); + auto Tnp_v = Tnp->View(); + auto Tnm_v = Tnm->View(); + const int Nsimd = CComplex::Nsimd(); + accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { + coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); + coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); + }); + + // Possible more fine grained control is needed than a linear sweep, + // but huge productivity gain if this is simple algorithm and not a tunable + int m =1; + if ( n>=ordermin ) m=n-ordermin; + if ( (m%orderstep)==0 ) { + Mn=*Tnp; + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; FineField noise(FineGrid); FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); - for(int b=0;b "< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(0.002,60.0,1500,-0.5,3.5); \ - hermop.Op(Mn,noise); std::cout< "< Cheb(llo,hhi,oorder); \ + /* JacobiPolynomial Cheb(0.0,60.0,oorder,alpha,beta);*/\ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,combined); \ + } + double node = 0.000; + FILTERb(lo,hi,orderfilter);// 0 + // FILTERc(node,hi,51);// 0 + noise = Mn; + int base = 0; + int mult = 100; + FILTER(node,hi,base+1*mult); + FILTER(node,hi,base+2*mult); + FILTER(node,hi,base+3*mult); + FILTER(node,hi,base+4*mult); + FILTER(node,hi,base+5*mult); + FILTER(node,hi,base+6*mult); + FILTER(node,hi,base+7*mult); + FILTER(node,hi,base+8*mult); + FILTER(node,hi,base+9*mult); + FILTER(node,hi,base+10*mult); + FILTER(node,hi,base+11*mult); + FILTER(node,hi,base+12*mult); + FILTER(node,hi,base+13*mult); + FILTER(node,hi,base+14*mult); + FILTER(node,hi,base+15*mult); + assert(b==nn); } +#endif + +#if 0 + virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + // Initial matrix element + hermop.Op(noise,Mn); std::cout< "< JacobiPoly(0.005,60.,1500); + // JacobiPolynomial JacobiPoly(0.002,60.0,1500,-0.5,3.5); + //JacobiPolynomial JacobiPoly(0.03,60.0,500,-0.5,3.5); + // JacobiPolynomial JacobiPoly(0.00,60.0,1000,-0.5,3.5); + JacobiPoly(hermop,noise,Mn); + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< "< class CoarsenedMatrix : public SparseMatrixBase > > { public: - typedef iVector siteVector; + typedef iVector siteVector; + typedef Lattice CoarseComplexField; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; - + typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; @@ -293,7 +541,6 @@ public: CartesianStencil Stencil; std::vector A; - /////////////////////// // Interface @@ -305,33 +552,71 @@ public: conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - RealD Nin = norm2(in); + // RealD Nin = norm2(in); SimpleCompressor compressor; + + double comms_usec = -usecond(); Stencil.HaloExchange(in,compressor); + comms_usec += usecond(); + auto in_v = in.View(); auto out_v = out.View(); - thread_for(ss,Grid()->oSites(),{ - siteVector res = Zero(); - siteVector nbr; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(); + // double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; + // double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); + double usecs =-usecond(); + // assert(geom.npoint==9); + + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; int ptype; StencilEntry *SE; + + int lane=SIMTlane(Nsimd); for(int point=0;point_is_local&&SE->_permute) { - permute(nbr,in_v[SE->_offset],ptype); - } else if(SE->_is_local) { - nbr = in_v[SE->_offset]; + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); } else { - nbr = Stencil.CommBuf()[SE->_offset]; + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + } + synchronise(); + + for(int bb=0;bb compressor; Stencil.HaloExchange(in,compressor); - - auto point = [dir, disp](){ - if(dir == 0 and disp == 0) - return 8; - else - return (4 * dir + 1 - disp) / 2; - }(); + } + void MdirCalc(const CoarseVector &in, CoarseVector &out, int point) + { + conformable(_grid,in.Grid()); + conformable(_grid,out.Grid()); + + typedef LatticeView Aview; + Vector AcceleratorViewContainer; + for(int p=0;poSites(),{ + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + int lane=SIMTlane(Nsimd); + SE=Stencil.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + } else { + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + } + synchronise(); + + for(int bb=0;bboSites(),1,{ + siteVector res = Zero(); siteVector nbr; int ptype; @@ -382,16 +696,65 @@ public: } else { nbr = Stencil.CommBuf()[SE->_offset]; } + synchronise(); - auto A_point = A[point].View(); - res = res + A_point[ss]*nbr; + res = res + Aview_p[point][ss]*nbr; - vstream(out_v[ss],res); + out_v[ss]=res; }); +#endif + } + void MdirAll(const CoarseVector &in,std::vector &out) + { + this->MdirComms(in); + int ndir=geom.npoint-1; + if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { + std::cout <<"MdirAll out size "<< out.size()<MdirComms(in); + + int ndim = in.Grid()->Nd(); + + ////////////// + // 4D action like wilson + // 0+ => 0 + // 0- => 1 + // 1+ => 2 + // 1- => 3 + // etc.. + ////////////// + // 5D action like DWF + // 1+ => 0 + // 1- => 1 + // 2+ => 2 + // 2- => 3 + // etc.. + auto point = [dir, disp, ndim](){ + if(dir == 0 and disp == 0) + return 8; + else if ( ndim==4 ) { + return (4 * dir + 1 - disp) / 2; + } else { + return (4 * (dir-1) + 1 - disp) / 2; + } + }(); + + MdirCalc(in,out,point); + }; - void Mdiag(const CoarseVector &in, CoarseVector &out){ - Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil + void Mdiag(const CoarseVector &in, CoarseVector &out) + { + int point=geom.npoint-1; + MdirCalc(in, out, point); // No comms }; @@ -401,25 +764,44 @@ public: geom(CoarseGrid._ndimension), hermitian(hermitian_), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - A(geom.npoint,&CoarseGrid) + A(geom.npoint,&CoarseGrid) { }; void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase > &linop, - Aggregation & Subspace){ + Aggregation & Subspace) + { + typedef Lattice FineComplexField; + typedef typename Fobj::scalar_type scalar_type; - FineField iblock(FineGrid); // contributions from within this block - FineField oblock(FineGrid); // contributions from outwith this block + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); + FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); + + std::vector masks(geom.npoint,FineGrid); + FineComplexField imask(FineGrid); // contributions from within this block + FineComplexField omask(FineGrid); // contributions from outwith this block + + FineComplexField evenmask(FineGrid); + FineComplexField oddmask(FineGrid); FineField phi(FineGrid); FineField tmp(FineGrid); FineField zz(FineGrid); zz=Zero(); FineField Mphi(FineGrid); + FineField Mphie(FineGrid); + FineField Mphio(FineGrid); + std::vector Mphi_p(geom.npoint,FineGrid); - Lattice > coor(FineGrid); + Lattice > coor (FineGrid); + Lattice > bcoor(FineGrid); + Lattice > bcb (FineGrid); CoarseVector iProj(Grid()); CoarseVector oProj(Grid()); + CoarseVector SelfProj(Grid()); + CoarseComplexField iZProj(Grid()); + CoarseComplexField oZProj(Grid()); + CoarseScalar InnerProd(Grid()); // Orthogonalise the subblocks over the basis @@ -428,69 +810,114 @@ public: // Compute the matrix elements of linop between this orthonormal // set of vectors. int self_stencil=-1; - for(int p=0;p_rdimensions[dir])/(Grid()->_rdimensions[dir]); + + LatticeCoordinate(coor,dir); + + /////////////////////////////////////////////////////// + // Work out even and odd block checkerboarding for fast diagonal term + /////////////////////////////////////////////////////// + if ( disp==1 ) { + bcb = bcb + div(coor,block); + } + + if ( disp==0 ) { + masks[p]= Zero(); + } else if ( disp==1 ) { + masks[p] = where(mod(coor,block)==(block-1),one,zero); + } else if ( disp==-1 ) { + masks[p] = where(mod(coor,block)==(Integer)0,one,zero); + } } + evenmask = where(mod(bcb,2)==(Integer)0,one,zero); + oddmask = one-evenmask; + assert(self_stencil!=-1); for(int i=0;i_rdimensions[dir])/(Grid()->_rdimensions[dir]); + if ( (disp==-1) || (!hermitian ) ) { - LatticeCoordinate(coor,dir); - - if ( disp==0 ){ - linop.OpDiag(phi,Mphi); - } - else { - linop.OpDir(phi,Mphi,dir,disp); - } - - //////////////////////////////////////////////////////////////////////// - // Pick out contributions coming from this cell and neighbour cell - //////////////////////////////////////////////////////////////////////// - if ( disp==0 ) { - iblock = Mphi; - oblock = Zero(); - } else if ( disp==1 ) { - oblock = where(mod(coor,block)==(block-1),Mphi,zz); - iblock = where(mod(coor,block)!=(block-1),Mphi,zz); - } else if ( disp==-1 ) { - oblock = where(mod(coor,block)==(Integer)0,Mphi,zz); - iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz); - } else { - assert(0); - } - - Subspace.ProjectToSubspace(iProj,iblock); - Subspace.ProjectToSubspace(oProj,oblock); - // blockProject(iProj,iblock,Subspace.subspace); - // blockProject(oProj,oblock,Subspace.subspace); - auto iProj_v = iProj.View() ; - auto oProj_v = oProj.View() ; - auto A_p = A[p].View(); - auto A_self = A[self_stencil].View(); - thread_for(ss, Grid()->oSites(),{ + //////////////////////////////////////////////////////////////////////// + // Pick out contributions coming from this cell and neighbour cell + //////////////////////////////////////////////////////////////////////// + omask = masks[p]; + imask = one-omask; + for(int j=0;joSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); + + } + } + } + + /////////////////////////////////////////// + // Faster alternate self coupling.. use hermiticity to save 2x + /////////////////////////////////////////// + { + mult(tmp,phi,evenmask); linop.Op(tmp,Mphie); + mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); + + // tmp = Mphie*evenmask + Mphio*oddmask; + { + auto tmp_ = tmp.View(); + auto evenmask_ = evenmask.View(); + auto oddmask_ = oddmask.View(); + auto Mphie_ = Mphie.View(); + auto Mphio_ = Mphio.View(); + accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ + coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); + }); + } + + blockProject(SelfProj,tmp,Subspace.subspace); + + auto SelfProj_ = SelfProj.View(); + auto A_self = A[self_stencil].View(); + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ + for(int j=0;j &out) = 0; // Abstract base virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base @@ -83,6 +84,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -93,8 +97,7 @@ public: _Mat.MdagM(in,out,n1,n2); } void HermOp(const Field &in, Field &out){ - RealD n1,n2; - HermOpAndNorm(in,out,n1,n2); + _Mat.MdagM(in,out); } }; @@ -116,6 +119,9 @@ public: _Mat.Mdir(in,out,dir,disp); assert(0); } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); assert(0); @@ -154,6 +160,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -162,7 +171,6 @@ public: } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ _Mat.M(in,out); - ComplexD dot= innerProduct(in,out); n1=real(dot); n2=norm2(out); } @@ -183,6 +191,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -234,6 +245,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; }; template class SchurDiagMooeeOperator : public SchurOperatorBase { diff --git a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index ffed7527..b959f53c 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -45,8 +45,13 @@ public: ni=M(in,tmp); no=Mdag(tmp,out); } + virtual void MdagM(const Field &in, Field &out) { + RealD ni, no; + MdagM(in,out,ni,no); + } virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; + virtual void MdirAll (const Field &in, std::vector &out)=0; }; ///////////////////////////////////////////////////////////////////////////////////////////// @@ -56,12 +61,12 @@ template class CheckerBoardedSparseMatrixBase : public SparseMatrix public: virtual GridBase *RedBlackGrid(void)=0; - ////////////////////////////////////////////////////////////////////// - // Query the even even properties to make algorithmic decisions - ////////////////////////////////////////////////////////////////////// - virtual RealD Mass(void) { return 0.0; }; - virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden - virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better + ////////////////////////////////////////////////////////////////////// + // Query the even even properties to make algorithmic decisions + ////////////////////////////////////////////////////////////////////// + virtual RealD Mass(void) { return 0.0; }; + virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden + virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better // half checkerboard operaions virtual void Meooe (const Field &in, Field &out)=0; diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 97e0e807..133db2b4 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -94,6 +94,24 @@ public: Coeffs.assign(0.,order); Coeffs[order-1] = 1.; }; + + // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's. + // Similar kick effect below the threshold as Lanczos filter approach + void InitLowPass(RealD _lo,RealD _hi,int _order) + { + lo=_lo; + hi=_hi; + order=_order; + + if(order < 2) exit(-1); + Coeffs.resize(order); + for(int j=0;j + +NAMESPACE_BEGIN(Grid); + +template +class JacobiPolynomial : public OperatorFunction { + private: + using OperatorFunction::operator(); + + int order; + RealD hi; + RealD lo; + RealD alpha; + RealD beta; + + public: + void csv(std::ostream &out){ + csv(out,lo,hi); + } + void csv(std::ostream &out,RealD llo,RealD hhi){ + RealD diff = hhi-llo; + RealD delta = diff*1.0e-5; + for (RealD x=llo-delta; x<=hhi; x+=delta) { + RealD f = approx(x); + out<< x<<" "< &Linop, const Field &in, Field &out) { + GridBase *grid=in.Grid(); + + int vol=grid->gSites(); + + Field T0(grid); + Field T1(grid); + Field T2(grid); + Field y(grid); + + + Field *Tnm = &T0; + Field *Tn = &T1; + Field *Tnp = &T2; + + // RealD T0=1.0; + T0=in; + + // RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo)); + // = x * 2/(hi-lo) - (hi+lo)/(hi-lo) + Linop.HermOp(T0,y); + RealD xscale = 2.0/(hi-lo); + RealD mscale = -(hi+lo)/(hi-lo); + Linop.HermOp(T0,y); + y=y*xscale+in*mscale; + + // RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y; + RealD halfAmB = (alpha-beta)*0.5; + RealD halfApBp2= (alpha+beta+2.0)*0.5; + T1 = halfAmB * in + halfApBp2*y; + + for(int n=2;n<=order;n++){ + + Linop.HermOp(*Tn,y); + y=xscale*y+mscale*(*Tn); + + RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta); + RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta); + RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta); + RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta); + + // Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp; + cny=cny/cnp; + cn1=cn1/cnp; + cn1=cn1/cnp; + cnm=cnm/cnp; + + *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm); + + // Cycle pointers to avoid copies + Field *swizzle = Tnm; + Tnm =Tn; + Tn =Tnp; + Tnp =swizzle; + } + out=*Tnp; + + } +}; +NAMESPACE_END(Grid); +#endif diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 398f578f..32ba4035 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -71,7 +71,6 @@ public: // Initial residual computation & set up RealD guess = norm2(psi); assert(std::isnan(guess) == 0); - Linop.HermOpAndNorm(psi, mmp, d, b); @@ -154,18 +153,18 @@ public: RealD resnorm = std::sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; - std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)< void basisOrthogonalize(std::vector &basis,Field &w,int k) { + // If assume basis[j] are already orthonormal, + // can take all inner products in parallel saving 2x bandwidth + // Save 3x bandwidth on the second line of loop. + // perhaps 2.5x speed up. + // 2x overall in Multigrid Lanczos for(int j=0; j& lmd, @@ -589,6 +594,7 @@ until convergence std::vector& evec, Field& w,int Nm,int k) { + std::cout<0) w -= lme[k-1] * evec[k-1]; - ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) + ComplexD zalph = innerProduct(evec_k,w); RealD alph = real(zalph); - w = w - alph * evec_k;// 5. wk:=wk−αkvk + w = w - alph * evec_k; - RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop - // 7. vk+1 := wk/βk+1 + RealD beta = normalise(w); lmd[k] = alph; lme[k] = beta; - if (k>0 && k % orth_period == 0) { + if ( (k>0) && ( (k % orth_period) == 0 )) { + std::cout<& lmd, std::vector& lme, diff --git a/Grid/algorithms/iterative/NormalEquations.h b/Grid/algorithms/iterative/NormalEquations.h index 270b0115..df82b6dc 100644 --- a/Grid/algorithms/iterative/NormalEquations.h +++ b/Grid/algorithms/iterative/NormalEquations.h @@ -33,26 +33,30 @@ NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////////////////////////////////////////// // Take a matrix and form an NE solver calling a Herm solver /////////////////////////////////////////////////////////////////////////////////////////////////////// -template class NormalEquations : public OperatorFunction{ +template class NormalEquations { private: SparseMatrixBase & _Matrix; OperatorFunction & _HermitianSolver; - + LinearFunction & _Guess; public: ///////////////////////////////////////////////////// // Wrap the usual normal equations trick ///////////////////////////////////////////////////// - NormalEquations(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver) - : _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; + NormalEquations(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver, + LinearFunction &Guess) + : _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; void operator() (const Field &in, Field &out){ Field src(in.Grid()); + Field tmp(in.Grid()); + MdagMLinearOperator,Field> MdagMOp(_Matrix); _Matrix.Mdag(in,src); - _HermitianSolver(src,out); // Mdag M out = Mdag in - + _Guess(src,out); + _HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in + } }; diff --git a/Grid/algorithms/iterative/PowerMethod.h b/Grid/algorithms/iterative/PowerMethod.h index 8a238ea6..6aa8e923 100644 --- a/Grid/algorithms/iterative/PowerMethod.h +++ b/Grid/algorithms/iterative/PowerMethod.h @@ -30,12 +30,12 @@ template class PowerMethod RealD vden = norm2(src_n); RealD na = vnum/vden; - if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { + if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { evalMaxApprox = na; + std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; return evalMaxApprox; } evalMaxApprox = na; - std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; src_n = tmp; } assert(0); diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h index d3188ecb..a61b62e0 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h @@ -38,10 +38,11 @@ Author: Peter Boyle /////////////////////////////////////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); +#define GCRLogLevel std::cout << GridLogMessage < -class PrecGeneralisedConjugateResidual : public OperatorFunction { +class PrecGeneralisedConjugateResidual : public LinearFunction { public: - using OperatorFunction::operator(); RealD Tolerance; Integer MaxIterations; @@ -49,23 +50,29 @@ public: int mmax; int nstep; int steps; + int level; GridStopWatch PrecTimer; GridStopWatch MatTimer; GridStopWatch LinalgTimer; - LinearFunction &Preconditioner; + LinearFunction &Preconditioner; + LinearOperatorBase &Linop; - PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction &Prec,int _mmax,int _nstep) : + void Level(int lv) { level=lv; }; + + PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase &_Linop,LinearFunction &Prec,int _mmax,int _nstep) : Tolerance(tol), MaxIterations(maxit), + Linop(_Linop), Preconditioner(Prec), mmax(_mmax), nstep(_nstep) { + level=1; verbose=1; }; - void operator() (LinearOperatorBase &Linop,const Field &src, Field &psi){ + void operator() (const Field &src, Field &psi){ psi=Zero(); RealD cp, ssq,rsq; @@ -84,9 +91,9 @@ public: steps=0; for(int k=0;k &Linop,const Field &src, Field &psi,RealD rsq){ + RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD cp; RealD a, b; @@ -134,9 +143,7 @@ public: std::vector p(mmax,grid); std::vector qq(mmax); - std::cout< inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - Coordinate gcoor; - ExtractBuffer mergebuf(Nsimd); - - vector_type vI; auto l_v = l.View(); - for(int o=0;ooSites();o++){ + thread_for( o, grid->oSites(), { + vector_type vI; + Coordinate gcoor; + ExtractBuffer mergebuf(Nsimd); for(int i=0;iiSites();i++){ grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); mergebuf[i]=(Integer)gcoor[mu]; } merge(vI,mergebuf); l_v[o]=vI; - } + }); }; // LatticeCoordinate(); diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 865a4b14..abe42733 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -1,5 +1,4 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/lattice/Lattice_transfer.h @@ -83,12 +82,35 @@ template inline void setCheckerboard(Lattice &full,const Latti }); } - template inline void blockProject(Lattice > &coarseData, + const Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + Lattice ip(coarse); + + // auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.View(); + auto ip_ = ip.View(); + for(int v=0;voSites(), vobj::Nsimd(), { + coalescedWrite(coarseData_[sc](v),ip_(sc)); + }); + } +} + +template +inline void blockProject1(Lattice > &coarseData, const Lattice &fineData, const std::vector > &Basis) { + typedef iVector coarseSiteData; + coarseSiteData elide; + typedef decltype(coalescedRead(elide)) ScalarComplex; GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); int _ndimension = coarse->_ndimension; @@ -106,26 +128,40 @@ inline void blockProject(Lattice > &coarseData, block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); } + int blockVol = fine->oSites()/coarse->oSites(); coarseData=Zero(); auto fineData_ = fineData.View(); auto coarseData_ = coarseData.View(); - // Loop over coars parallel, and then loop over fine associated with coarse. - thread_for( sf, fine->oSites(), { - int sc; - Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + //////////////////////////////////////////////////////////////////////////////////////////////////////// + // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. + // Otherwise do fine inner product per site, and make the update atomic + //////////////////////////////////////////////////////////////////////////////////////////////////////// + accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), { - thread_critical { - for(int i=0;i_rdimensions); // Block coordinate + + int sf; + decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); + + for(int sb=0;sb_rdimensions); + + reduce=reduce+innerProduct(Basis_(sf),fineData_(sf)); } + coalescedWrite(coarseData_[sc](i),reduce); }); return; } @@ -160,7 +196,7 @@ inline void blockZAXPY(Lattice &fineZ, auto fineY_ = fineY.View(); auto coarseA_= coarseA.View(); - thread_for(sf, fine->oSites(), { + accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { int sc; Coordinate coor_c(_ndimension); @@ -171,7 +207,7 @@ inline void blockZAXPY(Lattice &fineZ, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); // z = A x + y - fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf]; + coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); }); @@ -196,7 +232,7 @@ inline void blockInnerProduct(Lattice &CoarseInner, fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); - thread_for(ss, coarse->oSites(),{ + accelerator_for(ss, coarse->oSites(), 1, { CoarseInner_[ss] = coarse_inner_[ss]; }); } @@ -226,23 +262,29 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } + int blockVol = fine->oSites()/coarse->oSites(); // Turn this around to loop threaded over sc and interior loop // over sf would thread better - coarseData=Zero(); auto coarseData_ = coarseData.View(); auto fineData_ = fineData.View(); - thread_for(sf,fine->oSites(),{ - int sc; + accelerator_for(sc,coarse->oSites(),1,{ + + // One thread per sub block Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - thread_critical { + Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); + + for(int sb=0;sb_rdimensions); + coarseData_[sc]=coarseData_[sc]+fineData_[sf]; } @@ -296,6 +338,7 @@ inline void blockOrthogonalise(Lattice &ip,std::vector > } } +#if 0 template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -321,7 +364,7 @@ inline void blockPromote(const Lattice > &coarseData, auto coarseData_ = coarseData.View(); // Loop with a cache friendly loop ordering - thread_for(sf,fine->oSites(),{ + accelerator_for(sf,fine->oSites(),1,{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); @@ -332,13 +375,35 @@ inline void blockPromote(const Lattice > &coarseData, for(int i=0;i +inline void blockPromote(const Lattice > &coarseData, + Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + fineData=Zero(); + for(int i=0;i > ip = PeekIndex<0>(coarseData,i); + Lattice cip(coarse); + auto cip_ = cip.View(); + auto ip_ = ip.View(); + accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ + coalescedWrite(cip_[sc], ip_(sc)()); + }); + blockZAXPY(fineData,cip,Basis[i],fineData); + } +} +#endif // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Simd layouts need not match since we use peek/poke Local diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index 333ba49b..c2ccb98b 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -101,7 +101,8 @@ public: virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); // Efficient support for multigrid coarsening - virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); void Meooe5D (const FermionField &in, FermionField &out); void MeooeDag5D (const FermionField &in, FermionField &out); diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 379c5f8f..5aa7bfbd 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -62,14 +62,15 @@ public: // Efficient support for multigrid coarsening virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); - /////////////////////////////////////////////////////////////// - // Physical surface field utilities - /////////////////////////////////////////////////////////////// - // virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case - // virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case - virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); - virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); + /////////////////////////////////////////////////////////////// + // Physical surface field utilities + /////////////////////////////////////////////////////////////// + // virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case + // virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case + virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); + virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); // Constructors ContinuedFractionFermion5D(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h index c60a2e84..cbc6ca63 100644 --- a/Grid/qcd/action/fermion/FermionOperator.h +++ b/Grid/qcd/action/fermion/FermionOperator.h @@ -89,6 +89,7 @@ public: virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + virtual void MdirAll(const FermionField &in, std::vector &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { assert(0);}; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index b4d8d60b..5cb95ca6 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -103,6 +103,7 @@ public: // Multigrid assistance; force term uses too /////////////////////////////////////////////////////////////// void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index b10c0356..8e3d4be5 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -86,7 +86,8 @@ public: void MooeeDag (const FermionField &in, FermionField &out); void MooeeInvDag (const FermionField &in, FermionField &out); - void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); // These can be overridden by fancy 5d chiral action diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index d61515f0..928abd3f 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -67,12 +67,13 @@ public: // Efficient support for multigrid coarsening virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); - /////////////////////////////////////////////////////////////// - // Physical surface field utilities - /////////////////////////////////////////////////////////////// - virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); - virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); + /////////////////////////////////////////////////////////////// + // Physical surface field utilities + /////////////////////////////////////////////////////////////// + virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); + virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); // Constructors PartialFractionFermion5D(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 3a712435..a3f5d2d7 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -115,9 +115,10 @@ public: // Multigrid assistance; force term uses too /////////////////////////////////////////////////////////////// void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); - void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp, - int gamma, int dag); + void DhopDirAll(const FermionField &in, std::vector &out); + void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag); /////////////////////////////////////////////////////////////// // Extra methods added by derived diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 8f1073db..58b54421 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -111,15 +111,16 @@ public: virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + virtual void MdirAll(const FermionField &in, std::vector &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac // These can be overridden by fancy 5d chiral action virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; - void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; - void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; // Implement hopping term non-hermitian hopping term; half cb or both // Implement s-diagonal DW @@ -131,6 +132,9 @@ public: // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); + void DhopDirAll(const FermionField &in,std::vector &out); + void DhopDirComms(const FermionField &in); + void DhopDirCalc(const FermionField &in, FermionField &out,int point); /////////////////////////////////////////////////////////////// // New methods added diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 35715097..7348a463 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -60,6 +60,9 @@ public: int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; + static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, + int Nsite, const FermionField &in, std::vector &out) ; + static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma); @@ -100,8 +103,17 @@ public: private: - static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, + static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma); + + static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); // Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, diff --git a/Grid/qcd/action/fermion/g5HermitianLinop.h b/Grid/qcd/action/fermion/g5HermitianLinop.h index 2e417ceb..90bb6d59 100644 --- a/Grid/qcd/action/fermion/g5HermitianLinop.h +++ b/Grid/qcd/action/fermion/g5HermitianLinop.h @@ -54,6 +54,14 @@ public: _Mat.Mdir(in,tmp,dir,disp); G5R5(out,tmp); } + void OpDirAll(const Field &in, std::vector &out) { + Field tmp(in.Grid()); + _Mat.MdirAll(in,out); + for(int p=0;p &out) { + _Mat.MdirAll(in,out); + for(int p=0;p::Mdir (const FermionField &psi, FermionField &chi,in Meo5D(psi,tmp); this->DhopDir(tmp,chi,dir,disp); } +template +void CayleyFermion5D::MdirAll(const FermionField &psi, std::vector &out) +{ + FermionField tmp(psi.Grid()); + Meo5D(psi,tmp); + this->DhopDirAll(tmp,out); +} + // force terms; five routines; default to Dhop on diagonal template void CayleyFermion5D::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag) diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h index 7af02263..beeb3e00 100644 --- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h @@ -143,6 +143,25 @@ void ContinuedFractionFermion5D::Mdir (const FermionField &psi, FermionFi } } template +void ContinuedFractionFermion5D::MdirAll (const FermionField &psi, std::vector &chi) +{ + int Ls = this->Ls; + + this->DhopDirAll(psi,chi); // Dslash on diagonal. g5 Dslash is hermitian + + for(int p=0;p void ContinuedFractionFermion5D::Meooe (const FermionField &psi, FermionField &chi) { int Ls = this->Ls; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index c42e896f..fdaa2f71 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D::ZeroCounters(void) // Implement the general interface. Here we use SAME mass on all slices ///////////////////////////////////////////////////////////////////////// template -void ImprovedStaggeredFermion5D::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion5D::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } template +void ImprovedStaggeredFermion5D::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); +} +template RealD ImprovedStaggeredFermion5D::M(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index e2605d81..0b723c47 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -362,12 +362,19 @@ void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField } template -void ImprovedStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } +template +void ImprovedStaggeredFermion::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); // Not implemented yet +} template -void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +{ Compressor compressor; Stencil.HaloExchange(in, compressor); @@ -380,6 +387,7 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel }); }; + template void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, @@ -404,7 +412,6 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st #ifdef GRID_OMP Compressor compressor; int len = U.Grid()->oSites(); - const int LLs = 1; DhopTotalTime -= usecond(); diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h index 9f8f91ad..edc674cc 100644 --- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h @@ -31,7 +31,7 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -template + template void PartialFractionFermion5D::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ // this does both dag and undag but is trivial; make a common helper routing int Ls = this->Ls; @@ -45,8 +45,25 @@ void PartialFractionFermion5D::Mdir (const FermionField &psi, FermionFiel ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); } ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1); - } +template +void PartialFractionFermion5D::MdirAll (const FermionField &psi, std::vector &chi){ + // this does both dag and undag but is trivial; make a common helper routing + int Ls = this->Ls; + + this->DhopDirAll(psi,chi); + + for(int point=0;point void PartialFractionFermion5D::Meooe_internal(const FermionField &psi, FermionField &chi,int dag) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 1bdc9a64..613eaa7b 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -241,6 +241,15 @@ void WilsonFermion5D::DhopDir(const FermionField &in, FermionField &out,in Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma); }; +template +void WilsonFermion5D::DhopDirAll(const FermionField &in, std::vector &out) +{ + Compressor compressor(DaggerNo); + Stencil.HaloExchange(in,compressor); + uint64_t Nsite = Umu.Grid()->oSites(); + Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out); +}; + template void WilsonFermion5D::DerivInternal(StencilImpl & st, diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 756bdbf4..76b904e9 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -319,28 +319,51 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d } template -void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } +template +void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) +{ + DhopDirAll(in, out); +} template void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { + Compressor compressor(DaggerNo); + Stencil.HaloExchange(in, compressor); + int skip = (disp == 1) ? 0 : 1; int dirdisp = dir + skip * 4; int gamma = dir + (1 - skip) * 4; - DhopDirDisp(in, out, dirdisp, gamma, DaggerNo); + DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); }; - template -void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) { - Compressor compressor(dag); - + Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); + + assert((out.size()==8)||(out.size()==9)); + for(int dir=0;dir +void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +{ int Ls=1; - int Nsite=in.oSites(); + uint64_t Nsite=in.oSites(); Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma); }; @@ -348,7 +371,8 @@ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index a787fa79..9e032b04 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); -#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ - if (gamma == Dir) { \ +#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ if (SE->_is_local ) { \ int perm= SE->_permute; \ auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ @@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ - Recon(result, Uchi); \ - synchronise(); \ + Recon(result, Uchi); + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \ } + //////////////////////////////////////////////////////////////////// // All legs kernels ; comms then compute //////////////////////////////////////////////////////////////////// @@ -284,7 +287,36 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField } }; -template +#define DhopDirMacro(Dir,spProj,spRecon) \ + template \ + void WilsonKernels::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ + int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ + { \ + typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; \ + typedef decltype(coalescedRead(in[0])) calcSpinor; \ + calcHalfSpinor chi; \ + calcSpinor result; \ + calcHalfSpinor Uchi; \ + StencilEntry *SE; \ + int ptype; \ + const int Nsimd = SiteHalfSpinor::Nsimd(); \ + const int lane=SIMTlane(Nsimd); \ + \ + SE = st.GetEntry(ptype, dir, sF); \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ + coalescedWrite(out[sF], result,lane); \ + } + +DhopDirMacro(Xp,spProjXp,spReconXp); +DhopDirMacro(Yp,spProjYp,spReconYp); +DhopDirMacro(Zp,spProjZp,spReconZp); +DhopDirMacro(Tp,spProjTp,spReconTp); +DhopDirMacro(Xm,spProjXm,spReconXm); +DhopDirMacro(Ym,spProjYm,spReconYm); +DhopDirMacro(Zm,spProjZm,spReconZm); +DhopDirMacro(Tm,spProjTm,spReconTm); + +template void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { @@ -299,18 +331,7 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si const int lane=SIMTlane(Nsimd); SE = st.GetEntry(ptype, dir, sF); - if (gamma == Xp) { - if (SE->_is_local ) { - int perm= SE->_permute; - auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); - spProjXp(chi,tmp); - } else { - chi = coalescedRead(buf[SE->_offset],lane); - } - Impl::multLink(Uchi, U[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); @@ -321,6 +342,38 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si coalescedWrite(out[sF], result,lane); } +template +void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, + int Nsite, const FermionField &in, std::vector &out) +{ + auto U_v = U.View(); + auto in_v = in.View(); + auto st_v = st.View(); + + auto out_Xm = out[0].View(); + auto out_Ym = out[1].View(); + auto out_Zm = out[2].View(); + auto out_Tm = out[3].View(); + auto out_Xp = out[4].View(); + auto out_Yp = out[5].View(); + auto out_Zp = out[6].View(); + auto out_Tp = out[7].View(); + + accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ + int sU=sss/Ls; + int sF =sss; + DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0); + DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1); + DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2); + DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3); + DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4); + DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5); + DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6); + DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7); + }); +} + + template void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) @@ -332,13 +385,32 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S auto in_v = in.View(); auto out_v = out.View(); auto st_v = st.View(); - accelerator_for(ss,Nsite,Simd::Nsimd(),{ - for(int s=0;s &){ assert(0);} void Mdiag(const GaugeField&, GaugeField&){ assert(0);} void ImportGauge(const GaugeField& _U) { diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 471fbccc..4584fb36 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -403,6 +403,10 @@ namespace Optimization { accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){ return a/b; } + accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){ + return a/b; + } + // Danger -- element wise divide fro complex, not complex div. // See Grid_vector_types.h lines around 735, applied after "toReal" accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){ diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 472013f4..570f4234 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -628,6 +628,7 @@ void Grid_debug_handler_init(void) sigaction(SIGSEGV,&sa,NULL); sigaction(SIGTRAP,&sa,NULL); sigaction(SIGBUS,&sa,NULL); + sigaction(SIGUSR2,&sa,NULL); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); diff --git a/README.md b/README.md index 4b0a86f8..9f690ce0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) +# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) **Data parallel C++ mathematical object library.** diff --git a/configure.ac b/configure.ac index 1e368091..3d4ee383 100644 --- a/configure.ac +++ b/configure.ac @@ -280,7 +280,8 @@ case ${CXX} in # CXXLD="nvcc -v -link" CXX="nvcc -x cu " CXXLD="nvcc -link" - CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" +# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" + CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" if test $ac_openmp = yes; then CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" fi diff --git a/tests/lanczos/Test_synthetic_lanczos.cc b/tests/lanczos/Test_synthetic_lanczos.cc index 7a3591dd..a1e0e672 100644 --- a/tests/lanczos/Test_synthetic_lanczos.cc +++ b/tests/lanczos/Test_synthetic_lanczos.cc @@ -73,6 +73,7 @@ public: } // Support for coarsening to a multigrid + void OpDirAll (const Field &in, std::vector &out){}; void OpDiag (const Field &in, Field &out) {}; void OpDir (const Field &in, Field &out,int dir,int disp){}; diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 74adc417..873530ff 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -1,4 +1,6 @@ - /************************************************************************************* + + +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -29,323 +31,174 @@ Author: paboyle /* END LEGAL */ #include #include -//#include using namespace std; using namespace Grid; - ; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 -class myclass: Serializable { -public: - - GRID_SERIALIZABLE_CLASS_MEMBERS(myclass, - int, domaindecompose, - int, domainsize, - int, order, - int, Ls, - double, mq, - double, lo, - double, hi, - int, steps); - - myclass(){}; - -}; -myclass params; + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ RealD InverseApproximation(RealD x){ return 1.0/x; } -template +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template class MultiGridPreconditioner : public LinearFunction< Lattice > { public: typedef Aggregation Aggregates; typedef CoarsenedMatrix CoarseOperator; - - typedef typename Aggregation::siteVector siteVector; - typedef typename Aggregation::CoarseScalar CoarseScalar; typedef typename Aggregation::CoarseVector CoarseVector; typedef typename Aggregation::CoarseMatrix CoarseMatrix; typedef typename Aggregation::FineField FineField; typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; Aggregates & _Aggregates; CoarseOperator & _CoarseOperator; Matrix & _FineMatrix; FineOperator & _FineOperator; - Matrix & _SmootherMatrix; - FineOperator & _SmootherOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < fMdagMOp(_FineMatrix); - - p1=in; - for(int i=0;i<20;i++){ - RealD absp1=std::sqrt(norm2(p1)); - fMdagMOp.HermOp(p1,p2);// this is the G5 herm bit - // _FineOperator.Op(p1,p2);// this is the G5 herm bit - RealD absp2=std::sqrt(norm2(p2)); - if(i%10==9) - std::cout< CG(1.0e-10,100000); - ConjugateGradient fCG(3.0e-2,1000); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - MdagMLinearOperator fMdagMOp(_FineMatrix); - - FineField tmp(in.Grid()); - FineField res(in.Grid()); - FineField Min(in.Grid()); - - // Monitor completeness of low mode space - _Aggregates.ProjectToSubspace (Csrc,in); - _Aggregates.PromoteFromSubspace(Csrc,out); - std::cout< CG(1.0e-10,100000); - ConjugateGradient fCG(3.0e-2,1000); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - ShiftedMdagMLinearOperator fMdagMOp(_FineMatrix,0.1); - - FineField tmp(in.Grid()); - FineField res(in.Grid()); - FineField Qin(in.Grid()); - - // Monitor completeness of low mode space - // _Aggregates.ProjectToSubspace (Csrc,in); - // _Aggregates.PromoteFromSubspace(Csrc,out); - // std::cout< fMdagMOp(_FineMatrix); - ShiftedMdagMLinearOperator fMdagMOp(_SmootherMatrix,0.0); - - RealD Ni,r; - - Ni = norm2(in); - - for(int ilo=0;ilo<3;ilo++){ - for(int ord=5;ord<50;ord*=2){ - - std::cout << " lo "< Cheby (lo[ilo],70.0,ord,InverseApproximation); - Cheby(fMdagMOp,vec1,vec2); // solves MdagM = g5 M g5M - - _FineOperator.Op(vec2,vec1);// this is the G5 herm bit - vec1 = in - vec1; // tmp = in - A Min - r=norm2(vec1); - std::cout< CG(3.0e-3,100000); - - HermitianLinearOperator HermOp(_CoarseOperator); - MdagMLinearOperator MdagMOp(_CoarseOperator); - // MdagMLinearOperator fMdagMOp(_FineMatrix); - ShiftedMdagMLinearOperator fMdagMOp(_SmootherMatrix,0.0); - + CoarseVector Csol(_CoarseOperator.Grid()); FineField vec1(in.Grid()); FineField vec2(in.Grid()); - Chebyshev Cheby (params.lo,params.hi,params.order,InverseApproximation); - Chebyshev ChebyAccu(params.lo,params.hi,params.order,InverseApproximation); + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); const int nbasis= 32; - + const int nbasisc= 32; auto clatt = GridDefaultLatt(); for(int d=0;d seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); @@ -372,186 +233,167 @@ int main (int argc, char ** argv) GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); - - Gamma g5(Gamma::Algebra::Gamma5); - LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; - LatticeFermion result(FGrid); result=Zero(); - LatticeFermion ref(FGrid); ref=Zero(); - LatticeFermion tmp(FGrid); - LatticeFermion err(FGrid); + LatticeFermion result(FGrid); LatticeGaugeField Umu(UGrid); - LatticeGaugeField UmuDD(UGrid); - LatticeColourMatrix U(UGrid); - LatticeColourMatrix zz(UGrid); FieldMetaData header; std::string file("./ckpoint_lat.4000"); NerscIO::readConfiguration(Umu,header,file); - - if ( params.domaindecompose ) { - Lattice > coor(UGrid); - zz=Zero(); - for(int mu=0;mu(Umu,mu); - U = where(mod(coor,params.domainsize)==(Integer)0,zz,U); - PokeIndex(UmuDD,U,mu); - } - } else { - UmuDD = Umu; - } - // SU3::ColdConfiguration(RNG4,Umu); - // SU3::TepidConfiguration(RNG4,Umu); - // SU3::HotConfiguration(RNG4,Umu); - // Umu=Zero(); - - RealD mass=params.mq; - RealD M5=1.8; - std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; typedef CoarseOperator::CoarseVector CoarseVector; - + typedef CoarseOperator::siteVector siteVector; std::cout< HermDefOp(Ddwf); - Subspace Aggregates(Coarse5d,FGrid,0); - // Aggregates.CreateSubspace(RNG5,HermDefOp,nbasis); - assert ( (nbasis & 0x1)==0); - int nb=nbasis/2; - std::cout< Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Gamma5R5HermitianLinearOperator HermIndefOpDD(DdwfDD); - CoarsenedMatrix LDOp(*Coarse5d,1); // Hermitian matrix - LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + + Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + + + ////////////////////////////////////////////////// + // Deflate the course space. Recursive multigrid? + ////////////////////////////////////////////////// + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); std::cout< PosdefLdop(LDOp); - ConjugateGradient CG(1.0e-6,100000); - CG(PosdefLdop,c_src,c_res); + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,12.0,0.02,500,100,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + typedef Level2Op::CoarseVector CoarseCoarseVector; + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); - std::cout< HermIndefLdop(LDOp); -// ConjugateResidual MCR(1.0e-6,100000); -// MCR(HermIndefLdop,c_src,c_res); std::cout< Precon (Aggregates, LDOp, - HermIndefOp,Ddwf, - HermIndefOp,Ddwf); + MdagMLinearOperator IRLHermOpL2(L2Op); + Chebyshev IRLChebyL2(0.001,4.2,71); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); - // MultiGridPreconditioner PreconDD(Aggregates, LDOp, - // HermIndefOp,Ddwf, - // HermIndefOpDD,DdwfDD); - // TrivialPrecon simple; + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; - // std::cout< coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,12.0,3,L1LinOp,LDOp); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); - // std::cout< CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); - // std::cout< simple; - // ConjugateGradient fCG(1.0e-8,100000); - // fCG(HermDefOp,src,result); - - std::cout< HermOpEO(Ddwf); - ConjugateGradient pCG(1.0e-8,10000); - // pCG(HermOpEO,src_o,result_o); - - // std::cout< UPGCR(1.0e-8,100000,simple,8,128); - // UPGCR(HermIndefOp,src,result); + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); - /// Get themax eval - std::cout< CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); - - // std::cout< PGCRDD(1.0e-8,100000,PreconDD,8,128); - // result=Zero(); - // std::cout< l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); std::cout< PGCR(1.0e-8,100000,Precon,8,8); - std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); std::cout<