diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index f1ac1c81..48ea194b 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -35,17 +35,22 @@ Author: Peter Boyle #include #include +#include #include #include #include +#include +#include #include #include +#include #include #include #include #include #include +#include #include #include #include diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 913f5c0c..a6b01986 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -1,3 +1,14 @@ + // blockZaxpy in bockPromote - 3s, 5% + // noncoalesced linalg in Preconditionoer ~ 3s 5% + // Lancos tuning or replace 10-20s ~ 25%, open ended + // setup tuning 5s ~ 8% + // -- e.g. ordermin, orderstep tunables. + // MdagM path without norm in LinOp code. few seconds + + // Mdir calc blocking kernels + // Fuse kernels in blockMaskedInnerProduct + // preallocate Vectors in Cayley 5D ~ few percent few seconds + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -34,15 +45,36 @@ Author: paboyle NAMESPACE_BEGIN(Grid); +template +inline void blockMaskedInnerProduct(Lattice &CoarseInner, + const Lattice &FineMask, + const Lattice &fineX, + const Lattice &fineY) +{ + typedef decltype(innerProduct(vobj(),vobj())) dotp; + + GridBase *coarse(CoarseInner.Grid()); + GridBase *fine (fineX.Grid()); + + Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); + Lattice fine_inner_msk(fine); + + // Multiply could be fused with innerProduct + // Single block sum kernel could do both masks. + fine_inner = localInnerProduct(fineX,fineY); + mult(fine_inner_msk, fine_inner,FineMask); + blockSum(CoarseInner,fine_inner_msk); +} + + class Geometry { - // int dimension; public: int npoint; std::vector directions ; std::vector displacements; Geometry(int _d) { - + int base = (_d==5) ? 1:0; // make coarse grid stencil for 4d , not 5d @@ -52,10 +84,10 @@ public: directions.resize(npoint); displacements.resize(npoint); for(int d=0;d<_d;d++){ - directions[2*d ] = d+base; - directions[2*d+1] = d+base; - displacements[2*d ] = +1; - displacements[2*d+1] = -1; + directions[d ] = d+base; + directions[d+_d] = d+base; + displacements[d ] = +1; + displacements[d+_d]= -1; } directions [2*_d]=0; displacements[2*_d]=0; @@ -63,7 +95,7 @@ public: //// report back std::cout<oSites(),{ + accelerator_for(ss, CoarseGrid->oSites(),1,{ eProj[ss](i)=CComplex(1.0); }); eProj=eProj - iProj; @@ -146,61 +178,9 @@ public: void CreateSubspaceRandom(GridParallelRNG &RNG){ for(int i=0;i &hermop,int nn=nbasis) - { - // Run a Lanczos with sloppy convergence - const int Nstop = nn; - const int Nk = nn+20; - const int Np = nn+20; - const int Nm = Nk+Np; - const int MaxIt= 10000; - RealD resid = 1.0e-3; - - Chebyshev Cheb(0.5,64.0,21); - ImplicitlyRestartedLanczos IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt); - // IRL.lock = 1; - - FineField noise(FineGrid); gaussian(RNG,noise); - FineField tmp(FineGrid); - std::vector eval(Nm); - std::vector evec(Nm,FineGrid); - - int Nconv; - IRL.calc(eval,evec, - noise, - Nconv); - - // pull back nn vectors - for(int b=0;b "< "< Cheb(lo,hi,orderfilter); + Cheb(hermop,noise,Mn); + // normalise + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "<View(); + auto Tnp_v = Tnp->View(); + auto Tnm_v = Tnm->View(); + const int Nsimd = CComplex::Nsimd(); + accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { + coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); + coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); + }); + + // Possible more fine grained control is needed than a linear sweep, + // but huge productivity gain if this is simple algorithm and not a tunable + int m =1; + if ( n>=ordermin ) m=n-ordermin; + if ( (m%orderstep)==0 ) { + Mn=*Tnp; + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; FineField noise(FineGrid); FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); - for(int b=0;b "< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(0.002,60.0,1500,-0.5,3.5); \ - hermop.Op(Mn,noise); std::cout< "< Cheb(llo,hhi,oorder); \ + /* JacobiPolynomial Cheb(0.0,60.0,oorder,alpha,beta);*/\ + Cheb(hermop,noise,Mn); \ + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ + subspace[b] = Mn; \ + hermop.Op(Mn,tmp); \ + std::cout< "< Cheb(llo,hhi,oorder); \ + Cheb(hermop,noise,combined); \ + } + double node = 0.000; + FILTERb(lo,hi,orderfilter);// 0 + // FILTERc(node,hi,51);// 0 + noise = Mn; + int base = 0; + int mult = 100; + FILTER(node,hi,base+1*mult); + FILTER(node,hi,base+2*mult); + FILTER(node,hi,base+3*mult); + FILTER(node,hi,base+4*mult); + FILTER(node,hi,base+5*mult); + FILTER(node,hi,base+6*mult); + FILTER(node,hi,base+7*mult); + FILTER(node,hi,base+8*mult); + FILTER(node,hi,base+9*mult); + FILTER(node,hi,base+10*mult); + FILTER(node,hi,base+11*mult); + FILTER(node,hi,base+12*mult); + FILTER(node,hi,base+13*mult); + FILTER(node,hi,base+14*mult); + FILTER(node,hi,base+15*mult); + assert(b==nn); } +#endif + +#if 0 + virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + FineField combined(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + // Initial matrix element + hermop.Op(noise,Mn); std::cout< "< JacobiPoly(0.005,60.,1500); + // JacobiPolynomial JacobiPoly(0.002,60.0,1500,-0.5,3.5); + //JacobiPolynomial JacobiPoly(0.03,60.0,500,-0.5,3.5); + // JacobiPolynomial JacobiPoly(0.00,60.0,1000,-0.5,3.5); + JacobiPoly(hermop,noise,Mn); + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< "< class CoarsenedMatrix : public SparseMatrixBase > > { public: - typedef iVector siteVector; + typedef iVector siteVector; + typedef Lattice CoarseComplexField; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; - + typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; @@ -293,7 +535,6 @@ public: CartesianStencil Stencil; std::vector A; - /////////////////////// // Interface @@ -305,33 +546,71 @@ public: conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - RealD Nin = norm2(in); + // RealD Nin = norm2(in); SimpleCompressor compressor; + + double comms_usec = -usecond(); Stencil.HaloExchange(in,compressor); + comms_usec += usecond(); + auto in_v = in.View(); auto out_v = out.View(); - thread_for(ss,Grid()->oSites(),{ - siteVector res = Zero(); - siteVector nbr; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(); + // double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; + // double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); + double usecs =-usecond(); + // assert(geom.npoint==9); + + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; int ptype; StencilEntry *SE; + + int lane=SIMTlane(Nsimd); for(int point=0;point_is_local&&SE->_permute) { - permute(nbr,in_v[SE->_offset],ptype); - } else if(SE->_is_local) { - nbr = in_v[SE->_offset]; + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); } else { - nbr = Stencil.CommBuf()[SE->_offset]; + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + } + synchronise(); + + for(int bb=0;bb compressor; Stencil.HaloExchange(in,compressor); - - auto point = [dir, disp](){ - if(dir == 0 and disp == 0) - return 8; - else - return (4 * dir + 1 - disp) / 2; - }(); + } + void MdirCalc(const CoarseVector &in, CoarseVector &out, int point) + { + conformable(_grid,in.Grid()); + conformable(_grid,out.Grid()); + + typedef LatticeView Aview; + Vector AcceleratorViewContainer; + for(int p=0;poSites(),{ + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + int lane=SIMTlane(Nsimd); + SE=Stencil.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + } else { + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + } + synchronise(); + + for(int bb=0;bboSites(),1,{ + siteVector res = Zero(); siteVector nbr; int ptype; @@ -382,16 +690,65 @@ public: } else { nbr = Stencil.CommBuf()[SE->_offset]; } + synchronise(); - auto A_point = A[point].View(); - res = res + A_point[ss]*nbr; + res = res + Aview_p[point][ss]*nbr; - vstream(out_v[ss],res); + out_v[ss]=res; }); +#endif + } + void MdirAll(const CoarseVector &in,std::vector &out) + { + this->MdirComms(in); + int ndir=geom.npoint-1; + if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { + std::cout <<"MdirAll out size "<< out.size()<MdirComms(in); + + int ndim = in.Grid()->Nd(); + + ////////////// + // 4D action like wilson + // 0+ => 0 + // 0- => 1 + // 1+ => 2 + // 1- => 3 + // etc.. + ////////////// + // 5D action like DWF + // 1+ => 0 + // 1- => 1 + // 2+ => 2 + // 2- => 3 + // etc.. + auto point = [dir, disp, ndim](){ + if(dir == 0 and disp == 0) + return 8; + else if ( ndim==4 ) { + return (4 * dir + 1 - disp) / 2; + } else { + return (4 * (dir-1) + 1 - disp) / 2; + } + }(); + + MdirCalc(in,out,point); + }; - void Mdiag(const CoarseVector &in, CoarseVector &out){ - Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil + void Mdiag(const CoarseVector &in, CoarseVector &out) + { + int point=geom.npoint-1; + MdirCalc(in, out, point); // No comms }; @@ -401,25 +758,44 @@ public: geom(CoarseGrid._ndimension), hermitian(hermitian_), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - A(geom.npoint,&CoarseGrid) + A(geom.npoint,&CoarseGrid) { }; void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase > &linop, - Aggregation & Subspace){ + Aggregation & Subspace) + { + typedef Lattice FineComplexField; + typedef typename Fobj::scalar_type scalar_type; - FineField iblock(FineGrid); // contributions from within this block - FineField oblock(FineGrid); // contributions from outwith this block + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); + FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); + + std::vector masks(geom.npoint,FineGrid); + FineComplexField imask(FineGrid); // contributions from within this block + FineComplexField omask(FineGrid); // contributions from outwith this block + + FineComplexField evenmask(FineGrid); + FineComplexField oddmask(FineGrid); FineField phi(FineGrid); FineField tmp(FineGrid); FineField zz(FineGrid); zz=Zero(); FineField Mphi(FineGrid); + FineField Mphie(FineGrid); + FineField Mphio(FineGrid); + std::vector Mphi_p(geom.npoint,FineGrid); - Lattice > coor(FineGrid); + Lattice > coor (FineGrid); + Lattice > bcoor(FineGrid); + Lattice > bcb (FineGrid); bcb = Zero(); CoarseVector iProj(Grid()); CoarseVector oProj(Grid()); + CoarseVector SelfProj(Grid()); + CoarseComplexField iZProj(Grid()); + CoarseComplexField oZProj(Grid()); + CoarseScalar InnerProd(Grid()); // Orthogonalise the subblocks over the basis @@ -428,69 +804,117 @@ public: // Compute the matrix elements of linop between this orthonormal // set of vectors. int self_stencil=-1; - for(int p=0;p_rdimensions[dir])/(Grid()->_rdimensions[dir]); + + LatticeCoordinate(coor,dir); + + /////////////////////////////////////////////////////// + // Work out even and odd block checkerboarding for fast diagonal term + /////////////////////////////////////////////////////// + if ( disp==1 ) { + bcb = bcb + div(coor,block); + } + + if ( disp==0 ) { + masks[p]= Zero(); + } else if ( disp==1 ) { + masks[p] = where(mod(coor,block)==(block-1),one,zero); + } else if ( disp==-1 ) { + masks[p] = where(mod(coor,block)==(Integer)0,one,zero); + } } + evenmask = where(mod(bcb,2)==(Integer)0,one,zero); + oddmask = one-evenmask; + assert(self_stencil!=-1); for(int i=0;i_rdimensions[dir])/(Grid()->_rdimensions[dir]); + if ( (disp==-1) || (!hermitian ) ) { - LatticeCoordinate(coor,dir); - - if ( disp==0 ){ - linop.OpDiag(phi,Mphi); - } - else { - linop.OpDir(phi,Mphi,dir,disp); - } - - //////////////////////////////////////////////////////////////////////// - // Pick out contributions coming from this cell and neighbour cell - //////////////////////////////////////////////////////////////////////// - if ( disp==0 ) { - iblock = Mphi; - oblock = Zero(); - } else if ( disp==1 ) { - oblock = where(mod(coor,block)==(block-1),Mphi,zz); - iblock = where(mod(coor,block)!=(block-1),Mphi,zz); - } else if ( disp==-1 ) { - oblock = where(mod(coor,block)==(Integer)0,Mphi,zz); - iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz); - } else { - assert(0); - } - - Subspace.ProjectToSubspace(iProj,iblock); - Subspace.ProjectToSubspace(oProj,oblock); - // blockProject(iProj,iblock,Subspace.subspace); - // blockProject(oProj,oblock,Subspace.subspace); - auto iProj_v = iProj.View() ; - auto oProj_v = oProj.View() ; - auto A_p = A[p].View(); - auto A_self = A[self_stencil].View(); - thread_for(ss, Grid()->oSites(),{ + //////////////////////////////////////////////////////////////////////// + // Pick out contributions coming from this cell and neighbour cell + //////////////////////////////////////////////////////////////////////// + omask = masks[p]; + imask = one-omask; + for(int j=0;joSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); + // if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });} + // accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); }); + + } + } + } + + /////////////////////////////////////////// + // Faster alternate self coupling.. use hermiticity to save 2x + /////////////////////////////////////////// + { + mult(tmp,phi,evenmask); linop.Op(tmp,Mphie); + mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); + + { + auto tmp_ = tmp.View(); + auto evenmask_ = evenmask.View(); + auto oddmask_ = oddmask.View(); + auto Mphie_ = Mphie.View(); + auto Mphio_ = Mphio.View(); + accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ + coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); + }); + } + + blockProject(SelfProj,tmp,Subspace.subspace); + + auto SelfProj_ = SelfProj.View(); + auto A_self = A[self_stencil].View(); + + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ + for(int j=0;j &out) = 0; // Abstract base virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base @@ -83,6 +84,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -93,8 +97,7 @@ public: _Mat.MdagM(in,out,n1,n2); } void HermOp(const Field &in, Field &out){ - RealD n1,n2; - HermOpAndNorm(in,out,n1,n2); + _Mat.MdagM(in,out); } }; @@ -116,6 +119,9 @@ public: _Mat.Mdir(in,out,dir,disp); assert(0); } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); assert(0); @@ -154,6 +160,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -162,7 +171,6 @@ public: } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ _Mat.M(in,out); - ComplexD dot= innerProduct(in,out); n1=real(dot); n2=norm2(out); } @@ -183,6 +191,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); } + void OpDirAll (const Field &in, std::vector &out){ + _Mat.MdirAll(in,out); + }; void Op (const Field &in, Field &out){ _Mat.M(in,out); } @@ -234,6 +245,9 @@ public: void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; }; template class SchurDiagMooeeOperator : public SchurOperatorBase { @@ -320,9 +334,135 @@ public: return axpy_norm(out,-1.0,tmp,in); } }; + + template + class NonHermitianSchurOperatorBase : public LinearOperatorBase + { + public: + virtual RealD Mpc (const Field& in, Field& out) = 0; + virtual RealD MpcDag (const Field& in, Field& out) = 0; + virtual void MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) { + Field tmp(in.Grid()); + tmp.Checkerboard() = in.Checkerboard(); + ni = Mpc(in,tmp); + no = MpcDag(tmp,out); + } + virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) { + assert(0); + } + virtual void HermOp(const Field& in, Field& out) { + assert(0); + } + void Op(const Field& in, Field& out) { + Mpc(in, out); + } + void AdjOp(const Field& in, Field& out) { + MpcDag(in, out); + } + // Support for coarsening to a multigrid + void OpDiag(const Field& in, Field& out) { + assert(0); // must coarsen the unpreconditioned system + } + void OpDir(const Field& in, Field& out, int dir, int disp) { + assert(0); + } + }; + + template + class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase + { + public: + Matrix& _Mat; + NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){}; + virtual RealD Mpc(const Field& in, Field& out) { + Field tmp(in.Grid()); + tmp.Checkerboard() = !in.Checkerboard(); + + _Mat.Meooe(in, tmp); + _Mat.MooeeInv(tmp, out); + _Mat.Meooe(out, tmp); + + _Mat.Mooee(in, out); + + return axpy_norm(out, -1.0, tmp, out); + } + virtual RealD MpcDag(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.MeooeDag(in, tmp); + _Mat.MooeeInvDag(tmp, out); + _Mat.MeooeDag(out, tmp); + + _Mat.MooeeDag(in, out); + + return axpy_norm(out, -1.0, tmp, out); + } + }; + + template + class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase + { + protected: + Matrix &_Mat; + + public: + NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){}; + virtual RealD Mpc(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.Meooe(in, out); + _Mat.MooeeInv(out, tmp); + _Mat.Meooe(tmp, out); + _Mat.MooeeInv(out, tmp); + + return axpy_norm(out, -1.0, tmp, in); + } + virtual RealD MpcDag(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.MooeeInvDag(in, out); + _Mat.MeooeDag(out, tmp); + _Mat.MooeeInvDag(tmp, out); + _Mat.MeooeDag(out, tmp); + + return axpy_norm(out, -1.0, tmp, in); + } + }; + + template + class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase + { + protected: + Matrix& _Mat; + + public: + NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){}; + + virtual RealD Mpc(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.MooeeInv(in, out); + _Mat.Meooe(out, tmp); + _Mat.MooeeInv(tmp, out); + _Mat.Meooe(out, tmp); + + return axpy_norm(out, -1.0, tmp, in); + } + virtual RealD MpcDag(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.MeooeDag(in, out); + _Mat.MooeeInvDag(out, tmp); + _Mat.MeooeDag(tmp, out); + _Mat.MooeeInvDag(out, tmp); + + return axpy_norm(out, -1.0, tmp, in); + } + }; + /////////////////////////////////////////////////////////////////////////////////////////////////// // Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta - // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi + // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi /////////////////////////////////////////////////////////////////////////////////////////////////// template using SchurDiagOneRH = SchurDiagTwoOperator ; template using SchurDiagOneLH = SchurDiagOneOperator ; diff --git a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index ffed7527..b959f53c 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -45,8 +45,13 @@ public: ni=M(in,tmp); no=Mdag(tmp,out); } + virtual void MdagM(const Field &in, Field &out) { + RealD ni, no; + MdagM(in,out,ni,no); + } virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; + virtual void MdirAll (const Field &in, std::vector &out)=0; }; ///////////////////////////////////////////////////////////////////////////////////////////// @@ -56,12 +61,12 @@ template class CheckerBoardedSparseMatrixBase : public SparseMatrix public: virtual GridBase *RedBlackGrid(void)=0; - ////////////////////////////////////////////////////////////////////// - // Query the even even properties to make algorithmic decisions - ////////////////////////////////////////////////////////////////////// - virtual RealD Mass(void) { return 0.0; }; - virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden - virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better + ////////////////////////////////////////////////////////////////////// + // Query the even even properties to make algorithmic decisions + ////////////////////////////////////////////////////////////////////// + virtual RealD Mass(void) { return 0.0; }; + virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden + virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better // half checkerboard operaions virtual void Meooe (const Field &in, Field &out)=0; diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 97e0e807..133db2b4 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -94,6 +94,24 @@ public: Coeffs.assign(0.,order); Coeffs[order-1] = 1.; }; + + // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's. + // Similar kick effect below the threshold as Lanczos filter approach + void InitLowPass(RealD _lo,RealD _hi,int _order) + { + lo=_lo; + hi=_hi; + order=_order; + + if(order < 2) exit(-1); + Coeffs.resize(order); + for(int j=0;j + +NAMESPACE_BEGIN(Grid); + +template +class JacobiPolynomial : public OperatorFunction { + private: + using OperatorFunction::operator(); + + int order; + RealD hi; + RealD lo; + RealD alpha; + RealD beta; + + public: + void csv(std::ostream &out){ + csv(out,lo,hi); + } + void csv(std::ostream &out,RealD llo,RealD hhi){ + RealD diff = hhi-llo; + RealD delta = diff*1.0e-5; + for (RealD x=llo-delta; x<=hhi; x+=delta) { + RealD f = approx(x); + out<< x<<" "< &Linop, const Field &in, Field &out) { + GridBase *grid=in.Grid(); + + int vol=grid->gSites(); + + Field T0(grid); + Field T1(grid); + Field T2(grid); + Field y(grid); + + + Field *Tnm = &T0; + Field *Tn = &T1; + Field *Tnp = &T2; + + // RealD T0=1.0; + T0=in; + + // RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo)); + // = x * 2/(hi-lo) - (hi+lo)/(hi-lo) + Linop.HermOp(T0,y); + RealD xscale = 2.0/(hi-lo); + RealD mscale = -(hi+lo)/(hi-lo); + Linop.HermOp(T0,y); + y=y*xscale+in*mscale; + + // RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y; + RealD halfAmB = (alpha-beta)*0.5; + RealD halfApBp2= (alpha+beta+2.0)*0.5; + T1 = halfAmB * in + halfApBp2*y; + + for(int n=2;n<=order;n++){ + + Linop.HermOp(*Tn,y); + y=xscale*y+mscale*(*Tn); + + RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta); + RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta); + RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta); + RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta); + + // Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp; + cny=cny/cnp; + cn1=cn1/cnp; + cn1=cn1/cnp; + cnm=cnm/cnp; + + *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm); + + // Cycle pointers to avoid copies + Field *swizzle = Tnm; + Tnm =Tn; + Tn =Tnp; + Tnp =swizzle; + } + out=*Tnp; + + } +}; +NAMESPACE_END(Grid); +#endif diff --git a/Grid/algorithms/approx/RemezGeneral.cc b/Grid/algorithms/approx/RemezGeneral.cc new file mode 100644 index 00000000..e41b4ed2 --- /dev/null +++ b/Grid/algorithms/approx/RemezGeneral.cc @@ -0,0 +1,473 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + + +// Constructor +AlgRemezGeneral::AlgRemezGeneral(double lower, double upper, long precision, + bigfloat (*f)(bigfloat x, void *data), void *data): f(f), + data(data), + prec(precision), + apstrt(lower), apend(upper), apwidt(upper - lower), + n(0), d(0), pow_n(0), pow_d(0) +{ + bigfloat::setDefaultPrecision(prec); + + std::cout<<"Approximation bounds are ["< tolerance) { + if (iter++ % report_freq==0) + std::cout<<"Iteration " <= tolerance ); + + search(); + } + + int sign; + double error = (double)getErr(mm[0],&sign); + std::cout<<"Converged at "<= 0; i--) yn = x * yn + (num_pows[i] != -1 ? param[c--] : bigfloat(0l)); + + c = n+d; + bigfloat yd = 1l; //Highest degree coefficient is 1.0 + for (int i = pow_d-1; i >= 0; i--) yd = x * yd + (den_pows[i] != -1 ? param[c--] : bigfloat(0l)); + + return(yn/yd); +} + +// Compute size and sign of the approximation error at x +bigfloat AlgRemezGeneral::getErr(bigfloat x, int *sign) const{ + bigfloat f = func(x); + bigfloat e = approx(x) - f; + if (f != 0) e /= f; + if (e < (bigfloat)0.0) { + *sign = -1; + e = -e; + } + else *sign = 1; + + return(e); +} + +// Solve the system AX=B +int AlgRemezGeneral::simq(){ + + int ip, ipj, ipk, ipn; + int idxpiv; + int kp, kp1, kpk, kpn; + int nip, nkp; + bigfloat em, q, rownrm, big, size, pivot, sum; + bigfloat *aa; + bigfloat *X = param.data(); + + int n = neq; + int nm1 = n - 1; + // Initialize IPS and X + + int ij = 0; + for (int i = 0; i < n; i++) { + IPS[i] = i; + rownrm = 0.0; + for(int j = 0; j < n; j++) { + q = abs_bf(A[ij]); + if(rownrm < q) rownrm = q; + ++ij; + } + if (rownrm == (bigfloat)0l) { + std::cout<<"simq rownrm=0\n"; + return(1); + } + X[i] = (bigfloat)1.0 / rownrm; + } + + for (int k = 0; k < nm1; k++) { + big = 0.0; + idxpiv = 0; + + for (int i = k; i < n; i++) { + ip = IPS[i]; + ipk = n*ip + k; + size = abs_bf(A[ipk]) * X[ip]; + if (size > big) { + big = size; + idxpiv = i; + } + } + + if (big == (bigfloat)0l) { + std::cout<<"simq big=0\n"; + return(2); + } + if (idxpiv != k) { + int j = IPS[k]; + IPS[k] = IPS[idxpiv]; + IPS[idxpiv] = j; + } + kp = IPS[k]; + kpk = n*kp + k; + pivot = A[kpk]; + kp1 = k+1; + for (int i = kp1; i < n; i++) { + ip = IPS[i]; + ipk = n*ip + k; + em = -A[ipk] / pivot; + A[ipk] = -em; + nip = n*ip; + nkp = n*kp; + aa = A.data()+nkp+kp1; + for (int j = kp1; j < n; j++) { + ipj = nip + j; + A[ipj] = A[ipj] + em * *aa++; + } + } + } + kpn = n * IPS[n-1] + n - 1; // last element of IPS[n] th row + if (A[kpn] == (bigfloat)0l) { + std::cout<<"simq A[kpn]=0\n"; + return(3); + } + + + ip = IPS[0]; + X[0] = B[ip]; + for (int i = 1; i < n; i++) { + ip = IPS[i]; + ipj = n * ip; + sum = 0.0; + for (int j = 0; j < i; j++) { + sum += A[ipj] * X[j]; + ++ipj; + } + X[i] = B[ip] - sum; + } + + ipn = n * IPS[n-1] + n - 1; + X[n-1] = X[n-1] / A[ipn]; + + for (int iback = 1; iback < n; iback++) { + //i goes (n-1),...,1 + int i = nm1 - iback; + ip = IPS[i]; + nip = n*ip; + sum = 0.0; + aa = A.data()+nip+i+1; + for (int j= i + 1; j < n; j++) + sum += *aa++ * X[j]; + X[i] = (X[i] - sum) / A[nip+i]; + } + + return(0); +} + +void AlgRemezGeneral::csv(std::ostream & os) const{ + os << "Numerator" << std::endl; + for(int i=0;i<=pow_n;i++){ + os << getCoeffNum(i) << "*x^" << i; + if(i!=pow_n) os << " + "; + } + os << std::endl; + + os << "Denominator" << std::endl; + for(int i=0;i<=pow_d;i++){ + os << getCoeffDen(i) << "*x^" << i; + if(i!=pow_d) os << " + "; + } + os << std::endl; + + //For a true minimax solution the errors should all be equal and the signs should oscillate +-+-+- etc + int sign; + os << "Errors at maxima: coordinate, error, (sign)" << std::endl; + for(int i=0;i +#include + +#ifdef HAVE_LIBGMP +#include "bigfloat.h" +#else +#include "bigfloat_double.h" +#endif + + +class AlgRemezGeneral{ + public: + enum PolyType { Even, Odd, Full }; + + private: + + // In GSL-style, pass the function as a function pointer. Any data required to evaluate the function is passed in as a void pointer + bigfloat (*f)(bigfloat x, void *data); + void *data; + + // The approximation parameters + std::vector param; + bigfloat norm; + + // The number of non-zero terms in the numerator and denominator + int n, d; + // The numerator and denominator degree (i.e. the largest power) + int pow_n, pow_d; + + // Specify if the numerator and/or denominator are odd/even polynomials + PolyType num_type; + PolyType den_type; + std::vector num_pows; //contains the mapping, with -1 if not present + std::vector den_pows; + + // The bounds of the approximation + bigfloat apstrt, apwidt, apend; + + // Variables used to calculate the approximation + int nd1, iter; + std::vector xx; + std::vector mm; + std::vector step; + + bigfloat delta, spread; + + // Variables used in search + std::vector yy; + + // Variables used in solving linear equations + std::vector A; + std::vector B; + std::vector IPS; + + // The number of equations we must solve at each iteration (n+d+1) + int neq; + + // The precision of the GNU MP library + long prec; + + // Initialize member variables associated with the polynomial's properties + void setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in); + + // Initial values of maximal and minmal errors + void initialGuess(); + + // Initialise step sizes + void stpini(); + + // Initialize the algorithm + void reinitializeAlgorithm(); + + // Solve the equations + void equations(); + + // Search for error maxima and minima + void search(); + + // Calculate function required for the approximation + inline bigfloat func(bigfloat x) const{ + return f(x, data); + } + + // Compute size and sign of the approximation error at x + bigfloat getErr(bigfloat x, int *sign) const; + + // Solve the system AX=B where X = param + int simq(); + + // Evaluate the rational form P(x)/Q(x) using coefficients from the solution vector param + bigfloat approx(bigfloat x) const; + + public: + + AlgRemezGeneral(double lower, double upper, long prec, + bigfloat (*f)(bigfloat x, void *data), void *data); + + inline int getDegree(void) const{ + assert(n==d); + return n; + } + // Reset the bounds of the approximation + inline void setBounds(double lower, double upper) { + apstrt = lower; + apend = upper; + apwidt = apend - apstrt; + } + + // Get the bounds of the approximation + inline void getBounds(double &lower, double &upper) const{ + lower=(double)apstrt; + upper=(double)apend; + } + + // Run the algorithm to generate the rational approximation + double generateApprox(int num_degree, int den_degree, + PolyType num_type, PolyType den_type, + const double tolerance = 1e-15, const int report_freq = 1000); + + inline double generateApprox(int num_degree, int den_degree, + const double tolerance = 1e-15, const int report_freq = 1000){ + return generateApprox(num_degree, den_degree, Full, Full, tolerance, report_freq); + } + + // Evaluate the rational form P(x)/Q(x) using coefficients from the + // solution vector param + inline double evaluateApprox(double x) const{ + return (double)approx((bigfloat)x); + } + + // Evaluate the rational form Q(x)/P(x) using coefficients from the solution vector param + inline double evaluateInverseApprox(double x) const{ + return 1.0/(double)approx((bigfloat)x); + } + + // Calculate function required for the approximation + inline double evaluateFunc(double x) const{ + return (double)func((bigfloat)x); + } + + // Calculate inverse function required for the approximation + inline double evaluateInverseFunc(double x) const{ + return 1.0/(double)func((bigfloat)x); + } + + // Dump csv of function, approx and error + void csv(std::ostream &os = std::cout) const; + + // Get the coefficient of the term x^i in the numerator + inline double getCoeffNum(const int i) const{ + return num_pows[i] == -1 ? 0. : double(param[num_pows[i]]); + } + // Get the coefficient of the term x^i in the denominator + inline double getCoeffDen(const int i) const{ + if(i == pow_d) return 1.0; + else return den_pows[i] == -1 ? 0. : double(param[den_pows[i]+n+1]); + } +}; + +#endif diff --git a/Grid/algorithms/approx/ZMobius.cc b/Grid/algorithms/approx/ZMobius.cc new file mode 100644 index 00000000..65af901f --- /dev/null +++ b/Grid/algorithms/approx/ZMobius.cc @@ -0,0 +1,183 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/approx/ZMobius.cc + + Copyright (C) 2015 + +Author: Christopher Kelly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include +#include + +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Approx); + +//Compute the tanh approximation +inline double epsilonMobius(const double x, const std::vector &w){ + int Ls = w.size(); + + ComplexD fxp = 1., fmp = 1.; + for(int i=0;i &w){ + int Ls = w.size(); + + double fxp = 1., fmp = 1.; + for(int i=0;i &omega = *( (std::vector const*)data ); + bigfloat fxp(1.0); + bigfloat fmp(1.0); + + for(int i=0;i &omega_out, const int Ls_out, + const std::vector &omega_in, const int Ls_in, + const RealD lambda_bound){ + assert(omega_in.size() == Ls_in); + omega_out.resize(Ls_out); + + //Use the Remez algorithm to generate the appropriate rational polynomial + //For odd polynomial, to satisfy Haar condition must take either positive or negative half of range (cf https://arxiv.org/pdf/0803.0439.pdf page 6) + AlgRemezGeneral remez(0, lambda_bound, 64, &epsilonMobius, (void*)&omega_in); + remez.generateApprox(Ls_out-1, Ls_out,AlgRemezGeneral::Odd, AlgRemezGeneral::Even, 1e-15, 100); + remez.csv(std::cout); + + //The rational approximation has the form [ f(x) - f(-x) ] / [ f(x) + f(-x) ] where f(x) = \Prod_{i=0}^{L_s-1} ( \omega_i + x ) + //cf https://academiccommons.columbia.edu/doi/10.7916/D8T72HD7 pg 102 + //omega_i are therefore the negative of the complex roots of f(x) + + //We can find the roots by recognizing that the eigenvalues of a matrix A are the roots of the characteristic polynomial + // \rho(\lambda) = det( A - \lambda I ) where I is the unit matrix + //The matrix whose characteristic polynomial is an arbitrary monic polynomial a0 + a1 x + a2 x^2 + ... x^n is the companion matrix + // A = | 0 1 0 0 0 .... 0 | + // | 0 0 1 0 0 .... 0 | + // | : : : : : : | + // | 0 0 0 0 0 1 + // | -a0 -a1 -a2 ... ... -an| + + + //Note the Remez defines the largest power to have unit coefficient + std::vector coeffs(Ls_out+1); + for(int i=0;i > roots(Ls_out); + + //Form the companion matrix + Eigen::MatrixXd compn(Ls_out,Ls_out); + for(int i=0;i slv(compn, false); + + const auto & ev = slv.eigenvalues(); + for(int i=0;i omega_tmp = omega_out; + int s_low=0, s_high=Ls_out-1, ss=0; + for(int s_from = Ls_out-1; s_from >= 0; s_from--){ //loop from largest omega + int s_to; + if(ss % 2 == 0){ + s_to = s_low++; + }else{ + s_to = s_high--; + } + omega_out[s_to] = omega_tmp[s_from]; + ++ss; + } + + std::cout << "Resulting omega_i:" << std::endl; + for(int i=0;i \n"; + + int npt = 60; + double dlt = lambda_bound/double(npt-1); + + for (int i =0; i &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){ + std::vector omega_in(Ls_in, 1./mobius_param); + computeZmobiusOmega(omega_out, Ls_out, omega_in, Ls_in, lambda_bound); +} + +//ZMobius class takes gamma_i = (b+c) omega_i as its input, where b, c are factored out +void computeZmobiusGamma(std::vector &gamma_out, + const RealD mobius_param_out, const int Ls_out, + const RealD mobius_param_in, const int Ls_in, + const RealD lambda_bound){ + computeZmobiusOmega(gamma_out, Ls_out, mobius_param_in, Ls_in, lambda_bound); + for(int i=0;i &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){ + computeZmobiusGamma(gamma_out, mobius_param, Ls_out, mobius_param, Ls_in, lambda_bound); +} + +NAMESPACE_END(Approx); +NAMESPACE_END(Grid); diff --git a/Grid/algorithms/approx/ZMobius.h b/Grid/algorithms/approx/ZMobius.h new file mode 100644 index 00000000..f3e97634 --- /dev/null +++ b/Grid/algorithms/approx/ZMobius.h @@ -0,0 +1,57 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/approx/ZMobius.h + + Copyright (C) 2015 + +Author: Christopher Kelly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_ZMOBIUS_APPROX_H +#define GRID_ZMOBIUS_APPROX_H + +#include + +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Approx); + +//Compute the Zmobius Omega parameters suitable for eigenvalue range -lambda_bound <= lambda <= lambda_bound +//Note omega_i = 1/(b_i + c_i) where b_i and c_i are the Mobius parameters +void computeZmobiusOmega(std::vector &omega_out, const int Ls_out, + const std::vector &omega_in, const int Ls_in, + const RealD lambda_bound); + +//mobius_param = b+c with b-c=1 +void computeZmobiusOmega(std::vector &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound); + +//ZMobius class takes gamma_i = (b+c) omega_i as its input, where b, c are factored out +void computeZmobiusGamma(std::vector &gamma_out, + const RealD mobius_param_out, const int Ls_out, + const RealD mobius_param_in, const int Ls_in, + const RealD lambda_bound); + +//Assumes mobius_param_out == mobius_param_in +void computeZmobiusGamma(std::vector &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound); + +NAMESPACE_END(Approx); +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/algorithms/approx/bigfloat_double.h b/Grid/algorithms/approx/bigfloat_double.h index 4a7a02b8..26dd6b02 100644 --- a/Grid/algorithms/approx/bigfloat_double.h +++ b/Grid/algorithms/approx/bigfloat_double.h @@ -25,6 +25,10 @@ Author: Peter Boyle See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + +#ifndef INCLUDED_BIGFLOAT_DOUBLE_H +#define INCLUDED_BIGFLOAT_DOUBLE_H + #include typedef double mfloat; @@ -186,4 +190,6 @@ public: // friend bigfloat& random(void); }; +#endif + diff --git a/Grid/algorithms/iterative/BiCGSTAB.h b/Grid/algorithms/iterative/BiCGSTAB.h new file mode 100644 index 00000000..3a7be1ef --- /dev/null +++ b/Grid/algorithms/iterative/BiCGSTAB.h @@ -0,0 +1,222 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/algorithms/iterative/BiCGSTAB.h + +Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: paboyle +Author: juettner +Author: David Murphy + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ + +#ifndef GRID_BICGSTAB_H +#define GRID_BICGSTAB_H + +NAMESPACE_BEGIN(Grid); + +///////////////////////////////////////////////////////////// +// Base classes for iterative processes based on operators +// single input vec, single output vec. +///////////////////////////////////////////////////////////// + +template +class BiCGSTAB : public OperatorFunction +{ + public: + using OperatorFunction::operator(); + + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + + BiCGSTAB(RealD tol, Integer maxit, bool err_on_no_conv = true) : + Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){}; + + void operator()(LinearOperatorBase& Linop, const Field& src, Field& psi) + { + psi.Checkerboard() = src.Checkerboard(); + conformable(psi, src); + + RealD cp(0), rho(1), rho_prev(0), alpha(1), beta(0), omega(1); + RealD a(0), bo(0), b(0), ssq(0); + + Field p(src); + Field r(src); + Field rhat(src); + Field v(src); + Field s(src); + Field t(src); + Field h(src); + + v = Zero(); + p = Zero(); + + // Initial residual computation & set up + RealD guess = norm2(psi); + assert(std::isnan(guess) == 0); + + Linop.Op(psi, v); + b = norm2(v); + + r = src - v; + rhat = r; + a = norm2(r); + ssq = norm2(src); + + std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: mp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: r " << a << std::endl; + + RealD rsq = Tolerance * Tolerance * ssq; + + // Check if guess is really REALLY good :) + if(a <= rsq){ return; } + + std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: k=0 residual " << a << " target " << rsq << std::endl; + + GridStopWatch LinalgTimer; + GridStopWatch InnerTimer; + GridStopWatch AxpyNormTimer; + GridStopWatch LinearCombTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + + SolverTimer.Start(); + int k; + for (k = 1; k <= MaxIterations; k++) + { + rho_prev = rho; + + LinalgTimer.Start(); + InnerTimer.Start(); + ComplexD Crho = innerProduct(rhat,r); + InnerTimer.Stop(); + rho = Crho.real(); + + beta = (rho / rho_prev) * (alpha / omega); + + LinearCombTimer.Start(); + bo = beta * omega; + auto p_v = p.View(); + auto r_v = r.View(); + auto v_v = v.View(); + accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); + }); + LinearCombTimer.Stop(); + LinalgTimer.Stop(); + + MatrixTimer.Start(); + Linop.Op(p,v); + MatrixTimer.Stop(); + + LinalgTimer.Start(); + InnerTimer.Start(); + ComplexD Calpha = innerProduct(rhat,v); + InnerTimer.Stop(); + alpha = rho / Calpha.real(); + + LinearCombTimer.Start(); + auto h_v = h.View(); + auto psi_v = psi.View(); + accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); + }); + + auto s_v = s.View(); + accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); + }); + LinearCombTimer.Stop(); + LinalgTimer.Stop(); + + MatrixTimer.Start(); + Linop.Op(s,t); + MatrixTimer.Stop(); + + LinalgTimer.Start(); + InnerTimer.Start(); + ComplexD Comega = innerProduct(t,s); + InnerTimer.Stop(); + omega = Comega.real() / norm2(t); + + LinearCombTimer.Start(); + auto t_v = t.View(); + accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); + coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); + }); + LinearCombTimer.Stop(); + + cp = norm2(r); + LinalgTimer.Stop(); + + std::cout << GridLogIterative << "BiCGSTAB: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; + + // Stopping condition + if(cp <= rsq) + { + SolverTimer.Stop(); + Linop.Op(psi, v); + p = v - src; + + RealD srcnorm = sqrt(norm2(src)); + RealD resnorm = sqrt(norm2(p)); + RealD true_residual = resnorm / srcnorm; + + std::cout << GridLogMessage << "BiCGSTAB Converged on iteration " << k << std::endl; + std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp/ssq) << std::endl; + std::cout << GridLogMessage << "\tTrue residual " << true_residual << std::endl; + std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; + + std::cout << GridLogMessage << "Time breakdown " << std::endl; + std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() << std::endl; + std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() << std::endl; + std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() << std::endl; + std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() << std::endl; + std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl; + std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl; + + if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); } + + IterationsToComplete = k; + + return; + } + } + + std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl; + + if(ErrorOnNoConverge){ assert(0); } + IterationsToComplete = k; + } +}; + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h new file mode 100644 index 00000000..d9b2a9d7 --- /dev/null +++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h @@ -0,0 +1,158 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/algorithms/iterative/BiCGSTABMixedPrec.h + +Copyright (C) 2015 + +Author: Christopher Kelly +Author: David Murphy + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#ifndef GRID_BICGSTAB_MIXED_PREC_H +#define GRID_BICGSTAB_MIXED_PREC_H + +NAMESPACE_BEGIN(Grid); + +// Mixed precision restarted defect correction BiCGSTAB +template::value == 2, int>::type = 0, typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class MixedPrecisionBiCGSTAB : public LinearFunction +{ + public: + RealD Tolerance; + RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid; // Grid for single-precision fields + RealD OuterLoopNormMult; // Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + LinearOperatorBase &Linop_f; + LinearOperatorBase &Linop_d; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess + LinearFunction *guesser; + + MixedPrecisionBiCGSTAB(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, + LinearOperatorBase& _Linop_f, LinearOperatorBase& _Linop_d) : + Linop_f(_Linop_f), Linop_d(_Linop_d), Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), OuterLoopNormMult(100.), guesser(NULL) {}; + + void useGuesser(LinearFunction& g){ + guesser = &g; + } + + void operator() (const FieldD& src_d_in, FieldD& sol_d) + { + TotalInnerIterations = 0; + + GridStopWatch TotalTimer; + TotalTimer.Start(); + + int cb = src_d_in.Checkerboard(); + sol_d.Checkerboard() = cb; + + RealD src_norm = norm2(src_d_in); + RealD stop = src_norm * Tolerance*Tolerance; + + GridBase* DoublePrecGrid = src_d_in.Grid(); + FieldD tmp_d(DoublePrecGrid); + tmp_d.Checkerboard() = cb; + + FieldD tmp2_d(DoublePrecGrid); + tmp2_d.Checkerboard() = cb; + + FieldD src_d(DoublePrecGrid); + src_d = src_d_in; //source for next inner iteration, computed from residual during operation + + RealD inner_tol = InnerTolerance; + + FieldF src_f(SinglePrecGrid); + src_f.Checkerboard() = cb; + + FieldF sol_f(SinglePrecGrid); + sol_f.Checkerboard() = cb; + + BiCGSTAB CG_f(inner_tol, MaxInnerIterations); + CG_f.ErrorOnNoConverge = false; + + GridStopWatch InnerCGtimer; + + GridStopWatch PrecChangeTimer; + + Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count + + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++) + { + // Compute double precision rsd and also new RHS vector. + Linop_d.Op(sol_d, tmp_d); + RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector + + std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration " << outer_iter << " residual " << norm << " target " << stop << std::endl; + + if(norm < OuterLoopNormMult * stop){ + std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration converged on iteration " << outer_iter << std::endl; + break; + } + while(norm * inner_tol * inner_tol < stop){ inner_tol *= 2; } // inner_tol = sqrt(stop/norm) ?? + + PrecChangeTimer.Start(); + precisionChange(src_f, src_d); + PrecChangeTimer.Stop(); + + sol_f = Zero(); + + //Optionally improve inner solver guess (eg using known eigenvectors) + if(guesser != NULL){ (*guesser)(src_f, sol_f); } + + //Inner CG + CG_f.Tolerance = inner_tol; + InnerCGtimer.Start(); + CG_f(Linop_f, src_f, sol_f); + InnerCGtimer.Stop(); + TotalInnerIterations += CG_f.IterationsToComplete; + + //Convert sol back to double and add to double prec solution + PrecChangeTimer.Start(); + precisionChange(tmp_d, sol_f); + PrecChangeTimer.Stop(); + + axpy(sol_d, 1.0, tmp_d, sol_d); + } + + //Final trial CG + std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Starting final patch-up double-precision solve" << std::endl; + + BiCGSTAB CG_d(Tolerance, MaxInnerIterations); + CG_d(Linop_d, src_d_in, sol_d); + TotalFinalStepIterations = CG_d.IterationsToComplete; + + TotalTimer.Stop(); + std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; + std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; + } +}; + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/algorithms/iterative/BlockConjugateGradient.h b/Grid/algorithms/iterative/BlockConjugateGradient.h index 0bd7cf51..75a8be06 100644 --- a/Grid/algorithms/iterative/BlockConjugateGradient.h +++ b/Grid/algorithms/iterative/BlockConjugateGradient.h @@ -52,6 +52,7 @@ class BlockConjugateGradient : public OperatorFunction { Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer PrintInterval; //GridLogMessages or Iterative + RealD TrueResidual; BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100) @@ -306,7 +307,8 @@ void BlockCGrQsolve(LinearOperatorBase &Linop, const Field &B, Field &X) Linop.HermOp(X, AD); AD = AD-B; - std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) < &Linop, const Field &Src, Field & Linop.HermOp(Psi, AP); AP = AP-Src; - std::cout < &Linop, const std::vector max_resid ) max_resid = rr; } - std::cout << GridLogIterative << "\t Block Iteration "< &Linop, const std::vector IterationsToCompleteShift; // Iterations for this shift int verbose; MultiShiftFunction shifts; + std::vector TrueResidualShift; ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : MaxIterations(maxit), shifts(_shifts) { verbose=1; + IterationsToCompleteShift.resize(_shifts.order); + TrueResidualShift.resize(_shifts.order); } void operator() (LinearOperatorBase &Linop, const Field &src, Field &psi) @@ -125,6 +129,17 @@ public: // Residuals "r" are src // First search direction "p" is also src cp = norm2(src); + + // Handle trivial case of zero src. + if( cp == 0. ){ + for(int s=0;s void basisOrthogonalize(std::vector &basis,Field &w,int k) { + // If assume basis[j] are already orthonormal, + // can take all inner products in parallel saving 2x bandwidth + // Save 3x bandwidth on the second line of loop. + // perhaps 2.5x speed up. + // 2x overall in Multigrid Lanczos for(int j=0; j &basis,Eigen::MatrixXd& Qt,int j0, int j1, i { typedef decltype(basis[0].View()) View; auto tmp_v = basis[0].View(); - std::vector basis_v(basis.size(),tmp_v); + Vector basis_v(basis.size(),tmp_v); typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); - + for(int k=0;k > Bt(thread_max() * Nm); // Thread private - thread_region { vobj* B = Bt.data() + Nm * thread_num(); @@ -81,24 +85,89 @@ void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, i } }); } +#else + + int nrot = j1-j0; + + + uint64_t oSites =grid->oSites(); + uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead + + // printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock); + + Vector Bt(siteBlock * nrot); + auto Bp=&Bt[0]; + + // GPU readable copy of Eigen matrix + Vector Qt_jv(Nm*Nm); + double *Qt_p = & Qt_jv[0]; + for(int k=0;k void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) { + typedef decltype(basis[0].View()) View; typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); result.Checkerboard() = basis[0].Checkerboard(); auto result_v=result.View(); - thread_for(ss, grid->oSites(),{ - vobj B = Zero(); + Vector basis_v(basis.size(),result_v); + for(int k=0;k Qt_jv(Nm); + double * Qt_j = & Qt_jv[0]; + for(int k=0;koSites(),vobj::Nsimd(),{ + auto B=coalescedRead(zz); for(int k=k0; k& lmd, @@ -589,6 +661,7 @@ until convergence std::vector& evec, Field& w,int Nm,int k) { + std::cout<0) w -= lme[k-1] * evec[k-1]; - ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) + ComplexD zalph = innerProduct(evec_k,w); RealD alph = real(zalph); - w = w - alph * evec_k;// 5. wk:=wk−αkvk + w = w - alph * evec_k; - RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop - // 7. vk+1 := wk/βk+1 + RealD beta = normalise(w); lmd[k] = alph; lme[k] = beta; - if (k>0 && k % orth_period == 0) { + if ( (k>0) && ( (k % orth_period) == 0 )) { + std::cout<& lmd, std::vector& lme, diff --git a/Grid/algorithms/iterative/NormalEquations.h b/Grid/algorithms/iterative/NormalEquations.h index 270b0115..c9c92777 100644 --- a/Grid/algorithms/iterative/NormalEquations.h +++ b/Grid/algorithms/iterative/NormalEquations.h @@ -33,26 +33,78 @@ NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////////////////////////////////////////// // Take a matrix and form an NE solver calling a Herm solver /////////////////////////////////////////////////////////////////////////////////////////////////////// -template class NormalEquations : public OperatorFunction{ +template class NormalEquations { private: SparseMatrixBase & _Matrix; OperatorFunction & _HermitianSolver; - + LinearFunction & _Guess; public: ///////////////////////////////////////////////////// // Wrap the usual normal equations trick ///////////////////////////////////////////////////// - NormalEquations(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver) - : _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; + NormalEquations(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver, + LinearFunction &Guess) + : _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; void operator() (const Field &in, Field &out){ Field src(in.Grid()); + Field tmp(in.Grid()); + MdagMLinearOperator,Field> MdagMOp(_Matrix); _Matrix.Mdag(in,src); - _HermitianSolver(src,out); // Mdag M out = Mdag in + _Guess(src,out); + _HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in + + } +}; + +template class HPDSolver { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _HermitianSolver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + HPDSolver(LinearOperatorBase &Matrix, + OperatorFunction &HermitianSolver, + LinearFunction &Guess) + : _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + _Guess(in,out); + _HermitianSolver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +template class MdagMSolver { +private: + SparseMatrixBase & _Matrix; + OperatorFunction & _HermitianSolver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + MdagMSolver(SparseMatrixBase &Matrix, OperatorFunction &HermitianSolver, + LinearFunction &Guess) + : _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + MdagMLinearOperator,Field> MdagMOp(_Matrix); + _Guess(in,out); + + _HermitianSolver(MdagMOp,in,out); // Mdag M out = Mdag in + } }; diff --git a/Grid/algorithms/iterative/PowerMethod.h b/Grid/algorithms/iterative/PowerMethod.h index 8a238ea6..6aa8e923 100644 --- a/Grid/algorithms/iterative/PowerMethod.h +++ b/Grid/algorithms/iterative/PowerMethod.h @@ -30,12 +30,12 @@ template class PowerMethod RealD vden = norm2(src_n); RealD na = vnum/vden; - if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { + if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { evalMaxApprox = na; + std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; return evalMaxApprox; } evalMaxApprox = na; - std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; src_n = tmp; } assert(0); diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h index d3188ecb..bf454ade 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h @@ -38,10 +38,11 @@ Author: Peter Boyle /////////////////////////////////////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); +#define GCRLogLevel std::cout << GridLogMessage < -class PrecGeneralisedConjugateResidual : public OperatorFunction { +class PrecGeneralisedConjugateResidual : public LinearFunction { public: - using OperatorFunction::operator(); RealD Tolerance; Integer MaxIterations; @@ -49,23 +50,29 @@ public: int mmax; int nstep; int steps; + int level; GridStopWatch PrecTimer; GridStopWatch MatTimer; GridStopWatch LinalgTimer; - LinearFunction &Preconditioner; + LinearFunction &Preconditioner; + LinearOperatorBase &Linop; - PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction &Prec,int _mmax,int _nstep) : + void Level(int lv) { level=lv; }; + + PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase &_Linop,LinearFunction &Prec,int _mmax,int _nstep) : Tolerance(tol), MaxIterations(maxit), + Linop(_Linop), Preconditioner(Prec), mmax(_mmax), nstep(_nstep) { + level=1; verbose=1; }; - void operator() (LinearOperatorBase &Linop,const Field &src, Field &psi){ + void operator() (const Field &src, Field &psi){ psi=Zero(); RealD cp, ssq,rsq; @@ -84,9 +91,9 @@ public: steps=0; for(int k=0;k &Linop,const Field &src, Field &psi,RealD rsq){ + RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD cp; RealD a, b; @@ -134,9 +143,7 @@ public: std::vector p(mmax,grid); std::vector qq(mmax); - std::cout< class NonHermitianSchurRedBlackDiagMooeeSolve : public SchurRedBlackBase + { + public: + typedef CheckerBoardedSparseMatrixBase Matrix; + + NonHermitianSchurRedBlackDiagMooeeSolve(OperatorFunction& RBSolver, const bool initSubGuess = false, + const bool _solnAsInitGuess = false) + : SchurRedBlackBase(RBSolver, initSubGuess, _solnAsInitGuess) {}; + + ////////////////////////////////////////////////////// + // Override RedBlack specialisation + ////////////////////////////////////////////////////// + virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o) + { + GridBase* grid = _Matrix.RedBlackGrid(); + GridBase* fgrid = _Matrix.Grid(); + + Field tmp(grid); + Field Mtmp(grid); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd , src_o, src); + + ///////////////////////////////////////////////////// + // src_o = Mdag * (source_o - Moe MeeInv source_e) + ///////////////////////////////////////////////////// + _Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even ); + _Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd ); + src_o -= Mtmp; assert( src_o.Checkerboard() == Odd ); + } + + virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) + { + GridBase* grid = _Matrix.RedBlackGrid(); + GridBase* fgrid = _Matrix.Grid(); + + Field tmp(grid); + Field sol_e(grid); + Field src_e_i(grid); + + /////////////////////////////////////////////////// + // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... + /////////////////////////////////////////////////// + _Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even ); + src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even ); + _Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even ); + + setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even ); + setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd ); + } + + virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) + { + NonHermitianSchurDiagMooeeOperator _OpEO(_Matrix); + this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd); + } + + virtual void RedBlackSolve(Matrix& _Matrix, const std::vector& src_o, std::vector& sol_o) + { + NonHermitianSchurDiagMooeeOperator _OpEO(_Matrix); + this->_HermitianRBSolver(_OpEO, src_o, sol_o); + } + }; + /////////////////////////////////////////////////////////////////////////////////////////////////////// // Site diagonal is identity, right preconditioned by Mee^inv // ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta @@ -482,5 +546,76 @@ namespace Grid { this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); } }; + + template class NonHermitianSchurRedBlackDiagTwoSolve : public SchurRedBlackBase + { + public: + typedef CheckerBoardedSparseMatrixBase Matrix; + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations Schur trick + ///////////////////////////////////////////////////// + NonHermitianSchurRedBlackDiagTwoSolve(OperatorFunction& RBSolver, const bool initSubGuess = false, + const bool _solnAsInitGuess = false) + : SchurRedBlackBase(RBSolver, initSubGuess, _solnAsInitGuess) {}; + + virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o) + { + GridBase* grid = _Matrix.RedBlackGrid(); + GridBase* fgrid = _Matrix.Grid(); + + Field tmp(grid); + Field Mtmp(grid); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd , src_o, src); + + ///////////////////////////////////////////////////// + // src_o = Mdag * (source_o - Moe MeeInv source_e) + ///////////////////////////////////////////////////// + _Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even ); + _Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd ); + src_o -= Mtmp; assert( src_o.Checkerboard() == Odd ); + } + + virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) + { + GridBase* grid = _Matrix.RedBlackGrid(); + GridBase* fgrid = _Matrix.Grid(); + + Field sol_o_i(grid); + Field tmp(grid); + Field sol_e(grid); + + //////////////////////////////////////////////// + // MooeeInv due to pecond + //////////////////////////////////////////////// + _Matrix.MooeeInv(sol_o, tmp); + sol_o_i = tmp; + + /////////////////////////////////////////////////// + // sol_e = M_ee^-1 * ( src_e - Meo sol_o )... + /////////////////////////////////////////////////// + _Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even ); + tmp = src_e - tmp; assert( src_e.Checkerboard() == Even ); + _Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even ); + + setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even ); + setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd ); + }; + + virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) + { + NonHermitianSchurDiagTwoOperator _OpEO(_Matrix); + this->_HermitianRBSolver(_OpEO, src_o, sol_o); + }; + + virtual void RedBlackSolve(Matrix& _Matrix, const std::vector& src_o, std::vector& sol_o) + { + NonHermitianSchurDiagTwoOperator _OpEO(_Matrix); + this->_HermitianRBSolver(_OpEO, src_o, sol_o); + } + }; } + #endif diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index ed27a8bd..d53c4dc2 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,6 +6,12 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; +#ifdef GRID_NVCC +#define SMALL_LIMIT (0) +#else +#define SMALL_LIMIT (4096) +#endif + #ifdef POINTER_CACHE int PointerCache::victim; @@ -13,7 +19,7 @@ PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; void *PointerCache::Insert(void *ptr,size_t bytes) { - if (bytes < 4096 ) return ptr; + if (bytes < SMALL_LIMIT ) return ptr; #ifdef GRID_OMP assert(omp_in_parallel()==0); @@ -50,7 +56,7 @@ void *PointerCache::Insert(void *ptr,size_t bytes) { void *PointerCache::Lookup(size_t bytes) { - if (bytes < 4096 ) return NULL; + if (bytes < SMALL_LIMIT ) return NULL; #ifdef GRID_OMP assert(omp_in_parallel()==0); diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 2aa7d82b..8c189be8 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -49,8 +49,13 @@ NAMESPACE_BEGIN(Grid); #ifdef POINTER_CACHE class PointerCache { private: - +/*Pinning pages is costly*/ +/*Could maintain separate large and small allocation caches*/ +#ifdef GRID_NVCC + static const int Ncache=128; +#else static const int Ncache=8; +#endif static int victim; typedef struct { @@ -63,7 +68,6 @@ private: public: - static void *Insert(void *ptr,size_t bytes) ; static void *Lookup(size_t bytes) ; @@ -170,13 +174,14 @@ public: // Unified (managed) memory //////////////////////////////////// if ( ptr == (_Tp *) NULL ) { + // printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout); auto err = cudaMallocManaged((void **)&ptr,bytes); if( err != cudaSuccess ) { ptr = (_Tp *) NULL; std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " < friend class Lattice; - GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {}; + GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;}; GridBase(const Coordinate & processor_grid, const CartesianCommunicator &parent, int &split_rank) - : CartesianCommunicator(processor_grid,parent,split_rank) {}; + : CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;}; GridBase(const Coordinate & processor_grid, const CartesianCommunicator &parent) - : CartesianCommunicator(processor_grid,parent,dummy) {}; + : CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;}; virtual ~GridBase() = default; - // Physics Grid information. Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes. Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal @@ -80,7 +79,8 @@ public: Coordinate _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 - bool _isCheckerBoarded; + bool _isCheckerBoarded; + int LocallyPeriodic; public: diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 0b9b9b56..ec7c54ec 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -173,6 +173,7 @@ public: /////////////////////////////////////////////////// typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object scalar_object; typedef vobj vector_object; private: diff --git a/Grid/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h index 16f3641b..a1abe58d 100644 --- a/Grid/lattice/Lattice_coordinate.h +++ b/Grid/lattice/Lattice_coordinate.h @@ -37,19 +37,18 @@ template inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - Coordinate gcoor; - ExtractBuffer mergebuf(Nsimd); - - vector_type vI; auto l_v = l.View(); - for(int o=0;ooSites();o++){ + thread_for( o, grid->oSites(), { + vector_type vI; + Coordinate gcoor; + ExtractBuffer mergebuf(Nsimd); for(int i=0;iiSites();i++){ grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); mergebuf[i]=(Integer)gcoor[mu]; } merge(vI,mergebuf); l_v[o]=vI; - } + }); }; // LatticeCoordinate(); diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index c3e7e931..feca2f44 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -156,7 +156,7 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// template -void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ +accelerator_inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ GridBase *grid = l.Grid(); @@ -185,7 +185,7 @@ void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ }; template -void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ +accelerator_inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ GridBase *grid=l.Grid(); diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 865a4b14..c80e7db2 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -1,5 +1,4 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/lattice/Lattice_transfer.h @@ -83,12 +82,35 @@ template inline void setCheckerboard(Lattice &full,const Latti }); } - template inline void blockProject(Lattice > &coarseData, + const Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + Lattice ip(coarse); + + // auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.View(); + auto ip_ = ip.View(); + for(int v=0;voSites(), vobj::Nsimd(), { + coalescedWrite(coarseData_[sc](v),ip_(sc)); + }); + } +} + +template +inline void blockProject1(Lattice > &coarseData, const Lattice &fineData, const std::vector > &Basis) { + typedef iVector coarseSiteData; + coarseSiteData elide; + typedef decltype(coalescedRead(elide)) ScalarComplex; GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); int _ndimension = coarse->_ndimension; @@ -106,26 +128,40 @@ inline void blockProject(Lattice > &coarseData, block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); } + int blockVol = fine->oSites()/coarse->oSites(); coarseData=Zero(); auto fineData_ = fineData.View(); auto coarseData_ = coarseData.View(); - // Loop over coars parallel, and then loop over fine associated with coarse. - thread_for( sf, fine->oSites(), { - int sc; - Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + //////////////////////////////////////////////////////////////////////////////////////////////////////// + // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. + // Otherwise do fine inner product per site, and make the update atomic + //////////////////////////////////////////////////////////////////////////////////////////////////////// + accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), { - thread_critical { - for(int i=0;i_rdimensions); // Block coordinate + + int sf; + decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); + + for(int sb=0;sb_rdimensions); + + reduce=reduce+innerProduct(Basis_(sf),fineData_(sf)); } + coalescedWrite(coarseData_[sc](i),reduce); }); return; } @@ -160,7 +196,7 @@ inline void blockZAXPY(Lattice &fineZ, auto fineY_ = fineY.View(); auto coarseA_= coarseA.View(); - thread_for(sf, fine->oSites(), { + accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { int sc; Coordinate coor_c(_ndimension); @@ -171,7 +207,7 @@ inline void blockZAXPY(Lattice &fineZ, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); // z = A x + y - fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf]; + coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); }); @@ -196,7 +232,7 @@ inline void blockInnerProduct(Lattice &CoarseInner, fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); - thread_for(ss, coarse->oSites(),{ + accelerator_for(ss, coarse->oSites(), 1, { CoarseInner_[ss] = coarse_inner_[ss]; }); } @@ -226,23 +262,29 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } + int blockVol = fine->oSites()/coarse->oSites(); // Turn this around to loop threaded over sc and interior loop // over sf would thread better - coarseData=Zero(); auto coarseData_ = coarseData.View(); auto fineData_ = fineData.View(); - thread_for(sf,fine->oSites(),{ - int sc; + accelerator_for(sc,coarse->oSites(),1,{ + + // One thread per sub block Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - thread_critical { + Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); + + for(int sb=0;sb_rdimensions); + coarseData_[sc]=coarseData_[sc]+fineData_[sf]; } @@ -296,6 +338,7 @@ inline void blockOrthogonalise(Lattice &ip,std::vector > } } +#if 0 template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -321,7 +364,7 @@ inline void blockPromote(const Lattice > &coarseData, auto coarseData_ = coarseData.View(); // Loop with a cache friendly loop ordering - thread_for(sf,fine->oSites(),{ + accelerator_for(sf,fine->oSites(),1,{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); @@ -332,13 +375,35 @@ inline void blockPromote(const Lattice > &coarseData, for(int i=0;i +inline void blockPromote(const Lattice > &coarseData, + Lattice &fineData, + const std::vector > &Basis) +{ + GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + fineData=Zero(); + for(int i=0;i > ip = PeekIndex<0>(coarseData,i); + Lattice cip(coarse); + auto cip_ = cip.View(); + auto ip_ = ip.View(); + accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ + coalescedWrite(cip_[sc], ip_(sc)()); + }); + blockZAXPY(fineData,cip,Basis[i],fineData); + } +} +#endif // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Simd layouts need not match since we use peek/poke Local @@ -374,6 +439,67 @@ void localConvert(const Lattice &in,Lattice &out) }); } +template +void localCopyRegion(const Lattice &From,Lattice & To,Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + static const int words=sizeof(vobj)/sizeof(vector_type); + + GridBase *Fg = From.Grid(); + GridBase *Tg = To.Grid(); + assert(!Fg->_isCheckerBoarded); + assert(!Tg->_isCheckerBoarded); + int Nsimd = Fg->Nsimd(); + int nF = Fg->_ndimension; + int nT = Tg->_ndimension; + int nd = nF; + assert(nF == nT); + + for(int d=0;d_processors[d] == Tg->_processors[d]); + } + + // the above should guarantee that the operations are local + Coordinate ldf = Fg->_ldimensions; + Coordinate rdf = Fg->_rdimensions; + Coordinate isf = Fg->_istride; + Coordinate osf = Fg->_ostride; + Coordinate rdt = Tg->_rdimensions; + Coordinate ist = Tg->_istride; + Coordinate ost = Tg->_ostride; + auto t_v = To.View(); + auto f_v = From.View(); + accelerator_for(idx,Fg->lSites(),1,{ + sobj s; + Coordinate Fcoor(nd); + Coordinate Tcoor(nd); + Lexicographic::CoorFromIndex(Fcoor,idx,ldf); + int in_region=1; + for(int d=0;d=FromLowerLeft[d]+RegionSize[d]) ){ + in_region=0; + } + Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; + } + if (in_region) { + Integer idx_f = 0; for(int d=0;d void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice, int orthog) diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index 8034b552..f27f4c23 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -101,7 +101,8 @@ public: virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); // Efficient support for multigrid coarsening - virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); void Meooe5D (const FermionField &in, FermionField &out); void MeooeDag5D (const FermionField &in, FermionField &out); diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 379c5f8f..5aa7bfbd 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -62,14 +62,15 @@ public: // Efficient support for multigrid coarsening virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + virtual void MdirAll(const FermionField &in, std::vector &out); - /////////////////////////////////////////////////////////////// - // Physical surface field utilities - /////////////////////////////////////////////////////////////// - // virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case - // virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case - virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); - virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); + /////////////////////////////////////////////////////////////// + // Physical surface field utilities + /////////////////////////////////////////////////////////////// + // virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case + // virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case + virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); + virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); // Constructors ContinuedFractionFermion5D(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h index 3879dc3e..f0c2a039 100644 --- a/Grid/qcd/action/fermion/FermionOperator.h +++ b/Grid/qcd/action/fermion/FermionOperator.h @@ -89,6 +89,7 @@ public: virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + virtual void MdirAll(const FermionField &in, std::vector &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { assert(0);}; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index 5fe8246b..0cfae7b6 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -103,6 +103,7 @@ public: // Multigrid assistance; force term uses too /////////////////////////////////////////////////////////////// void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index f7ab3a25..0ce1c701 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -86,7 +86,8 @@ public: void MooeeDag (const FermionField &in, FermionField &out); void MooeeInvDag (const FermionField &in, FermionField &out); - void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + void Mdir (const FermionField &in, FermionField &out,int dir,int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); // These can be overridden by fancy 5d chiral action diff --git a/Grid/qcd/action/fermion/MADWF.h b/Grid/qcd/action/fermion/MADWF.h index f7f0ee1b..6b3c6e71 100644 --- a/Grid/qcd/action/fermion/MADWF.h +++ b/Grid/qcd/action/fermion/MADWF.h @@ -40,6 +40,11 @@ inline void convert(const Fieldi &from,Fieldo &to) to=from; } +struct MADWFinnerIterCallbackBase{ + virtual void operator()(const RealD current_resid){} + virtual ~MADWFinnerIterCallbackBase(){} +}; + template class MADWF { @@ -56,24 +61,30 @@ class MADWF RealD target_resid; int maxiter; - public: + //operator() is called on "callback" at the end of every inner iteration. This allows for example the adjustment of the inner + //tolerance to speed up subsequent iteration + MADWFinnerIterCallbackBase* callback; + + public: MADWF(Matrixo &_Mato, - Matrixi &_Mati, - PVinverter &_PauliVillarsSolvero, + Matrixi &_Mati, + PVinverter &_PauliVillarsSolvero, SchurSolver &_SchurSolveri, Guesser & _Guesseri, RealD resid, - int _maxiter) : + int _maxiter, + MADWFinnerIterCallbackBase* _callback = NULL) : Mato(_Mato),Mati(_Mati), SchurSolveri(_SchurSolveri), - PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri) - { - target_resid=resid; - maxiter =_maxiter; - }; - + PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri), + callback(_callback) + { + target_resid=resid; + maxiter =_maxiter; + }; + void operator() (const FermionFieldo &src4,FermionFieldo &sol5) { std::cout << GridLogMessage<< " ************************************************" << std::endl; @@ -177,6 +188,8 @@ class MADWF std::cout << GridLogMessage << "Residual " << i << ": " << resid << std::endl; std::cout << GridLogMessage << "***************************************" < &out); - /////////////////////////////////////////////////////////////// - // Physical surface field utilities - /////////////////////////////////////////////////////////////// - virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); - virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); + /////////////////////////////////////////////////////////////// + // Physical surface field utilities + /////////////////////////////////////////////////////////////// + virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); + virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d); // Constructors PartialFractionFermion5D(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index b62fc0df..2e0bc9bf 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -115,9 +115,10 @@ public: // Multigrid assistance; force term uses too /////////////////////////////////////////////////////////////// void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); - void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp, - int gamma, int dag); + void DhopDirAll(const FermionField &in, std::vector &out); + void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag); /////////////////////////////////////////////////////////////// // Extra methods added by derived diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index aa2f51d4..ea71376c 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -111,15 +111,16 @@ public: virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + virtual void MdirAll(const FermionField &in, std::vector &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac // These can be overridden by fancy 5d chiral action virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; - void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; - void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; + void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector twist) ; // Implement hopping term non-hermitian hopping term; half cb or both // Implement s-diagonal DW @@ -131,6 +132,9 @@ public: // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); + void DhopDirAll(const FermionField &in,std::vector &out); + void DhopDirComms(const FermionField &in); + void DhopDirCalc(const FermionField &in, FermionField &out,int point); /////////////////////////////////////////////////////////////// // New methods added diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index eaa49328..1bac9211 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -60,13 +60,25 @@ public: int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; + static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, + int Nsite, const FermionField &in, std::vector &out) ; + static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma); private: - static accelerator void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, + static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma); + + static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); + static accelerator_inline void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp); // Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, diff --git a/Grid/qcd/action/fermion/g5HermitianLinop.h b/Grid/qcd/action/fermion/g5HermitianLinop.h index 2e417ceb..90bb6d59 100644 --- a/Grid/qcd/action/fermion/g5HermitianLinop.h +++ b/Grid/qcd/action/fermion/g5HermitianLinop.h @@ -54,6 +54,14 @@ public: _Mat.Mdir(in,tmp,dir,disp); G5R5(out,tmp); } + void OpDirAll(const Field &in, std::vector &out) { + Field tmp(in.Grid()); + _Mat.MdirAll(in,out); + for(int p=0;p &out) { + _Mat.MdirAll(in,out); + for(int p=0;p::Mdir (const FermionField &psi, FermionField &chi,in Meo5D(psi,tmp); this->DhopDir(tmp,chi,dir,disp); } +template +void CayleyFermion5D::MdirAll(const FermionField &psi, std::vector &out) +{ + FermionField tmp(psi.Grid()); + Meo5D(psi,tmp); + this->DhopDirAll(tmp,out); +} + // force terms; five routines; default to Dhop on diagonal template void CayleyFermion5D::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag) diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h index 7af02263..beeb3e00 100644 --- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h @@ -143,6 +143,25 @@ void ContinuedFractionFermion5D::Mdir (const FermionField &psi, FermionFi } } template +void ContinuedFractionFermion5D::MdirAll (const FermionField &psi, std::vector &chi) +{ + int Ls = this->Ls; + + this->DhopDirAll(psi,chi); // Dslash on diagonal. g5 Dslash is hermitian + + for(int p=0;p void ContinuedFractionFermion5D::Meooe (const FermionField &psi, FermionField &chi) { int Ls = this->Ls; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index 8675600f..23692d49 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -538,10 +538,16 @@ void ImprovedStaggeredFermion5D::ZeroCounters(void) // Implement the general interface. Here we use SAME mass on all slices ///////////////////////////////////////////////////////////////////////// template -void ImprovedStaggeredFermion5D::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion5D::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } template +void ImprovedStaggeredFermion5D::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); +} +template RealD ImprovedStaggeredFermion5D::M(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index ac2283ce..37675da0 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -362,12 +362,19 @@ void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField } template -void ImprovedStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } +template +void ImprovedStaggeredFermion::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); // Not implemented yet +} template -void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { +void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +{ Compressor compressor; Stencil.HaloExchange(in, compressor); @@ -380,6 +387,7 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel }); }; + template void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, @@ -404,7 +412,6 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st #ifdef GRID_OMP Compressor compressor; int len = U.Grid()->oSites(); - const int LLs = 1; DhopTotalTime -= usecond(); diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h index 9f8f91ad..edc674cc 100644 --- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h @@ -31,7 +31,7 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -template + template void PartialFractionFermion5D::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ // this does both dag and undag but is trivial; make a common helper routing int Ls = this->Ls; @@ -45,8 +45,25 @@ void PartialFractionFermion5D::Mdir (const FermionField &psi, FermionFiel ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); } ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1); - } +template +void PartialFractionFermion5D::MdirAll (const FermionField &psi, std::vector &chi){ + // this does both dag and undag but is trivial; make a common helper routing + int Ls = this->Ls; + + this->DhopDirAll(psi,chi); + + for(int point=0;point void PartialFractionFermion5D::Meooe_internal(const FermionField &psi, FermionField &chi,int dag) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 68d20741..2a202a77 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -241,6 +241,15 @@ void WilsonFermion5D::DhopDir(const FermionField &in, FermionField &out,in Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma); }; +template +void WilsonFermion5D::DhopDirAll(const FermionField &in, std::vector &out) +{ + Compressor compressor(DaggerNo); + Stencil.HaloExchange(in,compressor); + uint64_t Nsite = Umu.Grid()->oSites(); + Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out); +}; + template void WilsonFermion5D::DerivInternal(StencilImpl & st, diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 686b67ca..be05fcf8 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -319,28 +319,51 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d } template -void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { +void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ DhopDir(in, out, dir, disp); } +template +void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) +{ + DhopDirAll(in, out); +} template void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { + Compressor compressor(DaggerNo); + Stencil.HaloExchange(in, compressor); + int skip = (disp == 1) ? 0 : 1; int dirdisp = dir + skip * 4; int gamma = dir + (1 - skip) * 4; - DhopDirDisp(in, out, dirdisp, gamma, DaggerNo); + DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); }; - template -void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) { - Compressor compressor(dag); - + Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); + + assert((out.size()==8)||(out.size()==9)); + for(int dir=0;dir +void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +{ int Ls=1; - int Nsite=in.oSites(); + uint64_t Nsite=in.oSites(); Kernels::DhopDirKernel(Stencil, Umu, Stencil.CommBuf(), Ls, Nsite, in, out, dirdisp, gamma); }; @@ -348,7 +371,8 @@ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 7397ed92..1fff4f5a 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -91,8 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); -#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ - if (gamma == Dir) { \ +#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ if (SE->_is_local ) { \ int perm= SE->_permute; \ auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ @@ -102,10 +101,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } \ synchronise(); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ - Recon(result, Uchi); \ - synchronise(); \ + Recon(result, Uchi); + +#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ + if (gamma == Dir) { \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \ } + //////////////////////////////////////////////////////////////////// // All legs kernels ; comms then compute //////////////////////////////////////////////////////////////////// @@ -284,7 +287,36 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField } }; -template +#define DhopDirMacro(Dir,spProj,spRecon) \ + template \ + void WilsonKernels::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ + int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ + { \ + typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; \ + typedef decltype(coalescedRead(in[0])) calcSpinor; \ + calcHalfSpinor chi; \ + calcSpinor result; \ + calcHalfSpinor Uchi; \ + StencilEntry *SE; \ + int ptype; \ + const int Nsimd = SiteHalfSpinor::Nsimd(); \ + const int lane=SIMTlane(Nsimd); \ + \ + SE = st.GetEntry(ptype, dir, sF); \ + GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ + coalescedWrite(out[sF], result,lane); \ + } + +DhopDirMacro(Xp,spProjXp,spReconXp); +DhopDirMacro(Yp,spProjYp,spReconYp); +DhopDirMacro(Zp,spProjZp,spReconZp); +DhopDirMacro(Tp,spProjTp,spReconTp); +DhopDirMacro(Xm,spProjXm,spReconXm); +DhopDirMacro(Ym,spProjYm,spReconYm); +DhopDirMacro(Zm,spProjZm,spReconZm); +DhopDirMacro(Tm,spProjTm,spReconTm); + +template void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { @@ -299,18 +331,7 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si const int lane=SIMTlane(Nsimd); SE = st.GetEntry(ptype, dir, sF); - if (gamma == Xp) { - if (SE->_is_local ) { - int perm= SE->_permute; - auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); - spProjXp(chi,tmp); - } else { - chi = coalescedRead(buf[SE->_offset],lane); - } - Impl::multLink(Uchi, U[sU], chi, dir, SE, st); - spReconXp(result, Uchi); - } - + GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp); GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp); GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp); @@ -321,6 +342,38 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si coalescedWrite(out[sF], result,lane); } +template +void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, + int Nsite, const FermionField &in, std::vector &out) +{ + auto U_v = U.View(); + auto in_v = in.View(); + auto st_v = st.View(); + + auto out_Xm = out[0].View(); + auto out_Ym = out[1].View(); + auto out_Zm = out[2].View(); + auto out_Tm = out[3].View(); + auto out_Xp = out[4].View(); + auto out_Yp = out[5].View(); + auto out_Zp = out[6].View(); + auto out_Tp = out[7].View(); + + accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ + int sU=sss/Ls; + int sF =sss; + DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0); + DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1); + DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2); + DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3); + DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4); + DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5); + DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6); + DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7); + }); +} + + template void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) @@ -332,13 +385,32 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S auto in_v = in.View(); auto out_v = out.View(); auto st_v = st.View(); - accelerator_for(ss,Nsite,Simd::Nsimd(),{ - for(int s=0;s, static Registrar< ConjugateGradientModule, HMC_SolverModuleFactory > __CGWFmodXMLInit("ConjugateGradient"); +static Registrar< BiCGSTABModule, + HMC_SolverModuleFactory > __CGWFmodXMLInit("BiCGSTAB"); static Registrar< ConjugateResidualModule, HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); diff --git a/Grid/qcd/modules/SolverModules.h b/Grid/qcd/modules/SolverModules.h index 65cb91f9..2daaaed5 100644 --- a/Grid/qcd/modules/SolverModules.h +++ b/Grid/qcd/modules/SolverModules.h @@ -119,6 +119,17 @@ class ConjugateGradientModule: public SolverModule +class BiCGSTABModule: public SolverModule { + typedef SolverModule SolverBase; + using SolverBase::SolverBase; // for constructors + + // acquire resource + virtual void initialize(){ + this->SolverPtr.reset(new BiCGSTAB(this->Par_.tolerance, this->Par_.max_iterations, true)); + } +}; + template class ConjugateResidualModule: public SolverModule { typedef SolverModule SolverBase; diff --git a/Grid/qcd/utils/CovariantLaplacian.h b/Grid/qcd/utils/CovariantLaplacian.h index 0e0620a7..94322507 100644 --- a/Grid/qcd/utils/CovariantLaplacian.h +++ b/Grid/qcd/utils/CovariantLaplacian.h @@ -92,6 +92,7 @@ public: }; void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);} + void MdirAll(const GaugeField&, std::vector &){ assert(0);} void Mdiag(const GaugeField&, GaugeField&){ assert(0);} void ImportGauge(const GaugeField& _U) { diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h index a9147c49..bf424fc7 100644 --- a/Grid/serialisation/BaseIO.h +++ b/Grid/serialisation/BaseIO.h @@ -97,6 +97,23 @@ namespace Grid { template struct is_tensor_variable : public std::false_type {}; template struct is_tensor_variable::value && !is_tensor_fixed::value>::type> : public std::true_type {}; + + // Helper functions to get the ultimate scalar inside a tensor, and corresponding size + template + inline typename std::enable_if::value, const typename ET::Index>::type + getScalarCount(const ET &eigenTensor) { return eigenTensor.size() * Traits::count; } + template + inline typename std::enable_if::value, const typename ET::Scalar *>::type + getFirstScalar(const ET &eigenTensor) { return eigenTensor.data(); } + template + inline typename std::enable_if::value, typename ET::Scalar *>::type + getFirstScalar(ET &eigenTensor) { return eigenTensor.data(); } + template + inline typename std::enable_if::value, const typename Traits::scalar_type *>::type + getFirstScalar(const ET &eigenTensor) { return eigenTensor.data()->begin(); } + template + inline typename std::enable_if::value, typename Traits::scalar_type *>::type + getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); } } // Abstract writer/reader classes //////////////////////////////////////////// @@ -128,23 +145,6 @@ namespace Grid { typename std::enable_if::value>::type write(const std::string &s, const ETensor &output); - // Helper functions for Scalar vs Container specialisations - template - inline typename std::enable_if::value, - const typename ETensor::Scalar *>::type - getFirstScalar(const ETensor &output) - { - return output.data(); - } - - template - inline typename std::enable_if::value, - const typename EigenIO::Traits::scalar_type *>::type - getFirstScalar(const ETensor &output) - { - return output.data()->begin(); - } - template inline typename std::enable_if::value, void>::type copyScalars(S * &pCopy, const S &Source) @@ -318,12 +318,12 @@ namespace Grid { TotalDims[TensorRank + i] = Traits::Dimension(i); // If the Tensor isn't in Row-Major order, then we'll need to copy it's data - const bool CopyData{NumElements > 1 && ETensor::Layout != Eigen::StorageOptions::RowMajor}; + const bool CopyData{NumElements > 1 && static_cast( ETensor::Layout ) != static_cast( Eigen::StorageOptions::RowMajor )}; const Scalar * pWriteBuffer; std::vector CopyBuffer; const Index TotalNumElements = NumElements * Traits::count; if( !CopyData ) { - pWriteBuffer = getFirstScalar( output ); + pWriteBuffer = EigenIO::getFirstScalar( output ); } else { // Regardless of the Eigen::Tensor storage order, the copy will be Row Major CopyBuffer.resize( TotalNumElements ); diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 471fbccc..4584fb36 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -403,6 +403,10 @@ namespace Optimization { accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){ return a/b; } + accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){ + return a/b; + } + // Danger -- element wise divide fro complex, not complex div. // See Grid_vector_types.h lines around 735, applied after "toReal" accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){ diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 472013f4..570f4234 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -628,6 +628,7 @@ void Grid_debug_handler_init(void) sigaction(SIGSEGV,&sa,NULL); sigaction(SIGTRAP,&sa,NULL); sigaction(SIGBUS,&sa,NULL); + sigaction(SIGUSR2,&sa,NULL); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); diff --git a/Hadrons/A2AMatrix.hpp b/Hadrons/A2AMatrix.hpp deleted file mode 100644 index 72ac7951..00000000 --- a/Hadrons/A2AMatrix.hpp +++ /dev/null @@ -1,777 +0,0 @@ -/************************************************************************************* - -Grid physics library, www.github.com/paboyle/Grid - -Source file: Hadrons/A2AMatrix.hpp - -Copyright (C) 2015-2019 - -Author: Antonin Portelli -Author: Peter Boyle - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#ifndef A2A_Matrix_hpp_ -#define A2A_Matrix_hpp_ - -#include -#include -#include -#ifdef USE_MKL -#include "mkl.h" -#include "mkl_cblas.h" -#endif - -#ifndef HADRONS_A2AM_NAME -#define HADRONS_A2AM_NAME "a2aMatrix" -#endif - -#ifndef HADRONS_A2AM_IO_TYPE -#define HADRONS_A2AM_IO_TYPE ComplexF -#endif - -#define HADRONS_A2AM_PARALLEL_IO - -BEGIN_HADRONS_NAMESPACE - -// general A2A matrix set based on Eigen tensors and Grid-allocated memory -// Dimensions: -// 0 - ext - external field (momentum, EM field, ...) -// 1 - str - spin-color structure -// 2 - t - timeslice -// 3 - i - left A2A mode index -// 4 - j - right A2A mode index -template -using A2AMatrixSet = Eigen::TensorMap>; - -template -using A2AMatrix = Eigen::Matrix; - -template -using A2AMatrixTr = Eigen::Matrix; - -/****************************************************************************** - * Abstract class for A2A kernels * - ******************************************************************************/ -template -class A2AKernel -{ -public: - A2AKernel(void) = default; - virtual ~A2AKernel(void) = default; - virtual void operator()(A2AMatrixSet &m, const Field *left, const Field *right, - const unsigned int orthogDim, double &time) = 0; - virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0; - virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0; -}; - -/****************************************************************************** - * Class to handle A2A matrix block HDF5 I/O * - ******************************************************************************/ -template -class A2AMatrixIo -{ -public: - // constructors - A2AMatrixIo(void) = default; - A2AMatrixIo(std::string filename, std::string dataname, - const unsigned int nt, const unsigned int ni = 0, - const unsigned int nj = 0); - // destructor - ~A2AMatrixIo(void) = default; - // access - unsigned int getNi(void) const; - unsigned int getNj(void) const; - unsigned int getNt(void) const; - size_t getSize(void) const; - // file allocation - template - void initFile(const MetadataType &d, const unsigned int chunkSize); - // block I/O - void saveBlock(const T *data, const unsigned int i, const unsigned int j, - const unsigned int blockSizei, const unsigned int blockSizej); - void saveBlock(const A2AMatrixSet &m, const unsigned int ext, const unsigned int str, - const unsigned int i, const unsigned int j); - template