diff --git a/Grid/GridCore.h b/Grid/GridCore.h
index 41c64ef6..90d01f2e 100644
--- a/Grid/GridCore.h
+++ b/Grid/GridCore.h
@@ -59,6 +59,7 @@ Author: paboyle
 #include
 #include
 #include
+#include
 #include
 #include
 NAMESPACE_CHECK(GridCore)
diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h
index 9eaf89f3..42fda6d1 100644
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -29,6 +29,9 @@ Author: Peter Boyle
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H
 
+NAMESPACE_CHECK(blas);
+#include
+
 NAMESPACE_CHECK(algorithms);
 #include
 #include
@@ -44,7 +47,10 @@ NAMESPACE_CHECK(SparseMatrix);
 #include
 #include
 NAMESPACE_CHECK(approx);
-#include
+#include
+#include
+#include
+NAMESPACE_CHECK(deflation);
 #include
 NAMESPACE_CHECK(ConjGrad);
 #include
@@ -67,10 +73,11 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include
 #include
 #include
-
+#include
+#include
 NAMESPACE_CHECK(PowerMethod);
-#include
-NAMESPACE_CHECK(CoarsendMatrix);
+#include
+NAMESPACE_CHECK(multigrid);
 #include
 
 #endif
diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h
index 5096231d..fb996bf1 100644
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -145,6 +145,44 @@ public:
   }
 };
 
+////////////////////////////////////////////////////////////////////
+// Create a shifted HermOp
+////////////////////////////////////////////////////////////////////
+template<class Field>
+class ShiftedHermOpLinearOperator : public LinearOperatorBase<Field> {
+  LinearOperatorBase<Field> &_Mat;
+  RealD _shift;
+public:
+  ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    assert(0);
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    assert(0);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  };
+  void Op     (const Field &in, Field &out){
+    HermOp(in,out);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    HermOp(in,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    HermOp(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+  void HermOp(const Field &in, Field &out){
+    _Mat.HermOp(in,out);
+    out = out + _shift*in;
+  }
+};
+
 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
 ////////////////////////////////////////////////////////////////////
diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
index 1d6984f3..130daf6c 100644
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -90,9 +90,8 @@ public:
     order=_order;
     if(order < 2) exit(-1);
-    Coeffs.resize(order);
-    Coeffs.assign(0.,order);
-    Coeffs[order-1] = 1.;
+    Coeffs.resize(order,0.0);
+    Coeffs[order-1] = 1.0;
   };
 
   // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
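A note on the ShiftedHermOpLinearOperator added above: its HermOp() applies M + shift, and RefineSubspace in the new Aggregates.h later in this patch constructs one so that CG can be run against a shifted system. A minimal usage sketch, assuming an existing LinearOperatorBase<Field> instance HermOp, a GridBase *grid and a source field b; these names are illustrative, not part of the patch:

  RealD shift = 0.02;                                        // illustrative value
  ShiftedHermOpLinearOperator<Field> Shifted(HermOp,shift);  // HermOp() now applies M + shift
  ConjugateGradient<Field> CG(1.0e-8,10000,false);           // tol, maxit, no assert on non-convergence
  Field x(grid); x = Zero();
  CG(Shifted,b,x);                                           // solves (M + shift) x = b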
diff --git a/Grid/algorithms/approx/MultiShiftFunction.h b/Grid/algorithms/approx/MultiShiftFunction.h index 58c75f6c..51d79f94 100644 --- a/Grid/algorithms/approx/MultiShiftFunction.h +++ b/Grid/algorithms/approx/MultiShiftFunction.h @@ -40,7 +40,7 @@ public: RealD norm; RealD lo,hi; - MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;}; + MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;}; RealD approx(RealD x); void csv(std::ostream &out); void gnuplot(std::ostream &out); diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index f6418b7e..a7edb485 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -42,6 +42,7 @@ Author: Peter Boyle #ifdef GRID_ONE_MKL #include #endif + /////////////////////////////////////////////////////////////////////// // Need to rearrange lattice data to be in the right format for a // batched multiply. Might as well make these static, dense packed @@ -633,7 +634,6 @@ public: deviceVector beta_p(1); acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); - // std::cout << "blasZgemmStridedBatched mnk "< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + + +/* + MultiRHS block projection + + Import basis -> nblock x nbasis x (block x internal) + Import vector of fine lattice objects -> nblock x nrhs x (block x internal) + + => coarse_(nrhs x nbasis )^block = via batched GEMM + +//template +//inline void blockProject(Lattice > &coarseData, +// const VLattice &fineData, +// const VLattice &Basis) +*/ + +template +class MultiRHSBlockProject +{ +public: + + typedef typename Field::scalar_type scalar; + typedef typename Field::scalar_object scalar_object; + typedef Field Fermion; + + int nbasis; + GridBase *coarse_grid; + GridBase *fine_grid; + uint64_t block_vol; + uint64_t fine_vol; + uint64_t coarse_vol; + uint64_t words; + + // Row major layout "C" order: + // BLAS_V[coarse_vol][nbasis][block_vol][words] + // BLAS_F[coarse_vol][nrhs][block_vol][words] + // BLAS_C[coarse_vol][nrhs][nbasis] + /* + * in Fortran column major notation (cuBlas order) + * + * Vxb = [v1(x)][..][vn(x)] ... x coarse vol + * + * Fxr = [r1(x)][..][rm(x)] ... 
x coarse vol + * + * Block project: + * C_br = V^dag F x coarse vol + * + * Block promote: + * F_xr = Vxb Cbr x coarse_vol + */ + deviceVector BLAS_V; // words * block_vol * nbasis x coarse_vol + deviceVector BLAS_F; // nrhs x fine_vol * words -- the sources + deviceVector BLAS_C; // nrhs x coarse_vol * nbasis -- the coarse coeffs + + RealD blasNorm2(deviceVector &blas) + { + scalar ss(0.0); + std::vector tmp(blas.size()); + acceleratorCopyFromDevice(&blas[0],&tmp[0],blas.size()*sizeof(scalar)); + for(int64_t s=0;sGlobalSum(ss); + return real(ss); + } + + MultiRHSBlockProject(){}; + ~MultiRHSBlockProject(){ Deallocate(); }; + + void Deallocate(void) + { + nbasis=0; + coarse_grid=nullptr; + fine_grid=nullptr; + fine_vol=0; + block_vol=0; + coarse_vol=0; + words=0; + BLAS_V.resize(0); + BLAS_F.resize(0); + BLAS_C.resize(0); + } + void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid) + { + nbasis=_nbasis; + + fine_grid=_fgrid; + coarse_grid=_cgrid; + + fine_vol = fine_grid->lSites(); + coarse_vol = coarse_grid->lSites(); + block_vol = fine_vol/coarse_vol; + + words = sizeof(scalar_object)/sizeof(scalar); + + BLAS_V.resize (fine_vol * words * nbasis ); + } + void ImportFineGridVectors(std::vector &vecs, deviceVector &blas) + { + int nvec = vecs.size(); + typedef typename Field::vector_object vobj; + // std::cout << GridLogMessage <<" BlockProjector importing "<_ndimension; + assert(block_vol == fine_grid->oSites() / coarse_grid->oSites()); + + Coordinate block_r (_ndimension); + for(int d=0 ; d<_ndimension;d++){ + block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d]; + } + + uint64_t sz = blas.size(); + + acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar)); + + Coordinate fine_rdimensions = fine_grid->_rdimensions; + Coordinate coarse_rdimensions = coarse_grid->_rdimensions; + int64_t bv= block_vol; + for(int v=0;voSites(); + + // loop over fine sites + const int Nsimd = vobj::Nsimd(); + // std::cout << "sz "<oSites() * block_vol * nvec * words<oSites() * block_vol * nvec * words); + uint64_t lwords= words; // local variable for copy in to GPU + accelerator_for(sf,osites,Nsimd,{ +#ifdef GRID_SIMT + { + int lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane &vecs, deviceVector &blas) + { + typedef typename Field::vector_object vobj; + + int nvec = vecs.size(); + + assert(vecs[0].Grid()==fine_grid); + + subdivides(coarse_grid,fine_grid); // require they map + + int _ndimension = coarse_grid->_ndimension; + assert(block_vol == fine_grid->oSites() / coarse_grid->oSites()); + + Coordinate block_r (_ndimension); + for(int d=0 ; d<_ndimension;d++){ + block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d]; + } + Coordinate fine_rdimensions = fine_grid->_rdimensions; + Coordinate coarse_rdimensions = coarse_grid->_rdimensions; + + // std::cout << " export fine Blas norm "<oSites(); + uint64_t lwords = words; + // std::cout << " Nsimd is "< + void ImportCoarseGridVectors(std::vector > &vecs, deviceVector &blas) + { + int nvec = vecs.size(); + typedef typename vobj::scalar_object coarse_scalar_object; + + // std::cout << " BlockProjector importing "<_ndimension; + + uint64_t sz = blas.size(); + + Coordinate coarse_rdimensions = coarse_grid->_rdimensions; + + for(int v=0;voSites(); + + // loop over fine sites + const int Nsimd = vobj::Nsimd(); + uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar); + assert(cwords==nbasis); + + accelerator_for(sc,osites,Nsimd,{ +#ifdef GRID_SIMT + { + int 
lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane + void ExportCoarseGridVectors(std::vector > &vecs, deviceVector &blas) + { + int nvec = vecs.size(); + typedef typename vobj::scalar_object coarse_scalar_object; + // std::cout << GridLogMessage<<" BlockProjector exporting "<_ndimension; + + uint64_t sz = blas.size(); + + Coordinate coarse_rdimensions = coarse_grid->_rdimensions; + + // std::cout << " export coarsee Blas norm "<oSites(); + + // loop over fine sites + const int Nsimd = vobj::Nsimd(); + uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar); + assert(cwords==nbasis); + + accelerator_for(sc,osites,Nsimd,{ + // Wrap in a macro "FOR_ALL_LANES(lane,{ ... }); +#ifdef GRID_SIMT + { + int lane=acceleratorSIMTlane(Nsimd); // buffer lane +#else + for(int lane=0;lane &vecs) + { + // std::cout << " BlockProjector Import basis size "< + void blockProject(std::vector &fine,std::vector< Lattice > & coarse) + { + int nrhs=fine.size(); + int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar); + // std::cout << "blockProject nbasis " < Vd(coarse_vol); + deviceVector Fd(coarse_vol); + deviceVector Cd(coarse_vol); + + // std::cout << "BlockProject pointers"< + void blockPromote(std::vector &fine,std::vector > & coarse) + { + int nrhs=fine.size(); + int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar); + assert(nbasis==_nbasis); + + BLAS_F.resize (fine_vol * words * nrhs ); + BLAS_C.resize (coarse_vol * nbasis * nrhs ); + + ImportCoarseGridVectors(coarse, BLAS_C); + + GridBLAS BLAS; + + deviceVector Vd(coarse_vol); + deviceVector Fd(coarse_vol); + deviceVector Cd(coarse_vol); + + for(int c=0;c + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + + +/* Need helper object for BLAS accelerated mrhs projection + + i) MultiRHS Deflation + + Import Evecs -> nev x vol x internal + Import vector of Lattice objects -> nrhs x vol x internal + => Cij (nrhs x Nev) via GEMM. 
+ => Guess (nrhs x vol x internal) = C x evecs (via GEMM) + Export + + + ii) MultiRHS block projection + + Import basis -> nblock x nbasis x (block x internal) + Import vector of fine lattice objects -> nblock x nrhs x (block x internal) + + => coarse_(nrhs x nbasis )^block = via batched GEMM + + iii) Alternate interface: + Import higher dim Lattice object-> vol x nrhs layout + +*/ +template +class MultiRHSDeflation +{ +public: + + typedef typename Field::scalar_type scalar; + typedef typename Field::scalar_object scalar_object; + + int nev; + std::vector eval; + GridBase *grid; + uint64_t vol; + uint64_t words; + + deviceVector BLAS_E; // nev x vol -- the eigenbasis (up to a 1/sqrt(lambda)) + deviceVector BLAS_R; // nrhs x vol -- the sources + deviceVector BLAS_G; // nrhs x vol -- the guess + deviceVector BLAS_C; // nrhs x nev -- the coefficients + + MultiRHSDeflation(){}; + ~MultiRHSDeflation(){ Deallocate(); }; + + void Deallocate(void) + { + nev=0; + grid=nullptr; + vol=0; + words=0; + BLAS_E.resize(0); + BLAS_R.resize(0); + BLAS_C.resize(0); + BLAS_G.resize(0); + } + void Allocate(int _nev,GridBase *_grid) + { + nev=_nev; + grid=_grid; + vol = grid->lSites(); + words = sizeof(scalar_object)/sizeof(scalar); + eval.resize(nev); + BLAS_E.resize (vol * words * nev ); + std::cout << GridLogMessage << " Allocate for "< &evec,std::vector &_eval) + { + ImportEigenBasis(evec,_eval,0,evec.size()); + } + // Could use to import a batch of eigenvectors + void ImportEigenBasis(std::vector &evec,std::vector &_eval, int _ev0, int _nev) + { + assert(_ev0+_nev<=evec.size()); + + Allocate(_nev,evec[0].Grid()); + + // Imports a sub-batch of eigenvectors, _ev0, ..., _ev0+_nev-1 + for(int e=0;e &source,std::vector & guess) + { + int nrhs = source.size(); + assert(source.size()==guess.size()); + assert(grid == guess[0].Grid()); + conformable(guess[0],source[0]); + + int64_t vw = vol * words; + + RealD t0 = usecond(); + BLAS_R.resize(nrhs * vw); // cost free if size doesn't change + BLAS_G.resize(nrhs * vw); // cost free if size doesn't change + BLAS_C.resize(nev * nrhs);// cost free if size doesn't change + + ///////////////////////////////////////////// + // Copy in the multi-rhs sources + ///////////////////////////////////////////// + // for(int r=0;r * Script A = SolverMatrix * Script P = Preconditioner * - * Deflation methods considered - * -- Solve P A x = P b [ like Luscher ] - * DEF-1 M P A x = M P b [i.e. 
left precon] - * DEF-2 P^T M A x = P^T M b - * ADEF-1 Preconditioner = M P + Q [ Q + M + M A Q] - * ADEF-2 Preconditioner = P^T M + Q - * BNN Preconditioner = P^T M P + Q - * BNN2 Preconditioner = M P + P^TM +Q - M P A M - * * Implement ADEF-2 * * Vstart = P^Tx + Qb * M1 = P^TM + Q * M2=M3=1 - * Vout = x */ +NAMESPACE_BEGIN(Grid); -// abstract base -template -class TwoLevelFlexiblePcg : public LinearFunction + +template +class TwoLevelCG : public LinearFunction { public: - int verbose; RealD Tolerance; Integer MaxIterations; - const int mmax = 5; GridBase *grid; - GridBase *coarsegrid; - LinearOperatorBase *_Linop - OperatorFunction *_Smoother, - LinearFunction *_CoarseSolver; - - // Need somthing that knows how to get from Coarse to fine and back again + // Fine operator, Smoother, CoarseSolver + LinearOperatorBase &_FineLinop; + LinearFunction &_Smoother; // more most opertor functions - TwoLevelFlexiblePcg(RealD tol, - Integer maxit, - LinearOperatorBase *Linop, - LinearOperatorBase *SmootherLinop, - OperatorFunction *Smoother, - OperatorFunction CoarseLinop - ) : + TwoLevelCG(RealD tol, + Integer maxit, + LinearOperatorBase &FineLinop, + LinearFunction &Smoother, + GridBase *fine) : Tolerance(tol), MaxIterations(maxit), - _Linop(Linop), - _PreconditionerLinop(PrecLinop), - _Preconditioner(Preconditioner) - { - verbose=0; + _FineLinop(FineLinop), + _Smoother(Smoother) + { + grid = fine; }; - - // The Pcg routine is common to all, but the various matrices differ from derived - // implementation to derived implmentation - void operator() (const Field &src, Field &psi){ - void operator() (const Field &src, Field &psi){ - - psi.Checkerboard() = src.Checkerboard(); - grid = src.Grid(); - + + virtual void operator() (const Field &src, Field &x) + { + std::cout << GridLogMessage<<"HDCG: fPcg starting single RHS"< p (mmax,grid); + int mmax = 5; + std::cout << GridLogMessage<<"HDCG: fPcg allocating"< p(mmax,grid); std::vector mmp(mmax,grid); std::vector pAp(mmax); - - Field x (grid); x = psi; - Field z (grid); + Field z(grid); Field tmp(grid); - Field r (grid); - Field mu (grid); - + Field mp (grid); + Field r (grid); + Field mu (grid); + + std::cout << GridLogMessage<<"HDCG: fPcg allocated"< int peri_kp = (k+1) % mmax; rtz=rtzp; - d= M3(p[peri_k],mp,mmp[peri_k],tmp); + d= PcgM3(p[peri_k],mmp[peri_k]); a = rtz/d; // Memorise this pAp[peri_k] = d; - + axpy(x,a,p[peri_k],x); RealD rn = axpy_norm(r,-a,mmp[peri_k],r); // Compute z = M x - M1(r,z,tmp,mp); - + PcgM1(r,z); + + { + RealD n1,n2; + n1=norm2(r); + n2=norm2(z); + std::cout << GridLogMessage<<"HDCG::fPcg iteration "< z + b p ; b = + // Standard search direction p -> z + b p b = (rtzp)/rtz; - + int northog; + // k=zero <=> peri_kp=1; northog = 1 + // k=1 <=> peri_kp=2; northog = 2 + // ... ... ... 
+ // k=mmax-2<=> peri_kp=mmax-1; northog = mmax-1 + // k=mmax-1<=> peri_kp=0; northog = 1 + // northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm + std::cout< } RealD rrn=sqrt(rn/ssq); - std::cout< &src, std::vector &x) + { + std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<Barrier(); + int nrhs = src.size(); + std::vector f(nrhs); + std::vector rtzp(nrhs); + std::vector rtz(nrhs); + std::vector a(nrhs); + std::vector d(nrhs); + std::vector b(nrhs); + std::vector rptzp(nrhs); + ///////////////////////////// + // Set up history vectors + ///////////////////////////// + int mmax = 3; + std::cout << GridLogMessage<<"HDCG: fPcg allocating"<Barrier(); + std::vector > p(nrhs); for(int r=0;rBarrier(); + std::vector > mmp(nrhs); for(int r=0;rBarrier(); + std::vector > pAp(nrhs); for(int r=0;rBarrier(); + std::vector z(nrhs,grid); + std::vector mp (nrhs,grid); + std::vector r (nrhs,grid); + std::vector mu (nrhs,grid); + std::cout << GridLogMessage<<"HDCG: fPcg allocated z,mp,r,mu"<Barrier(); + + //Initial residual computation & set up + std::vector src_nrm(nrhs); + for(int rhs=0;rhs tn(nrhs); + + GridStopWatch HDCGTimer; + HDCGTimer.Start(); + ////////////////////////// + // x0 = Vstart -- possibly modify guess + ////////////////////////// + Vstart(x,src); + + for(int rhs=0;rhs ssq(nrhs); + std::vector rsq(nrhs); + std::vector pp(nrhs,grid); + + for(int rhs=0;rhs rn(nrhs); + for (int k=0;k<=MaxIterations;k++){ + + int peri_k = k % mmax; + int peri_kp = (k+1) % mmax; + + for(int rhs=0;rhsBarrier(); + + RealD max_rn=0.0; + for(int rhs=0;rhsmmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm + std::cout< max_rn ) max_rn = rrn; + } + + // Stopping condition based on worst case + if ( max_rn <= Tolerance ) { + + HDCGTimer.Stop(); + std::cout< & in,std::vector & out) + { + std::cout << "PcgM1 default (cheat) mrhs version"<PcgM1(in[rhs],out[rhs]); + } + } + virtual void PcgM1(Field & in, Field & out) =0; + virtual void Vstart(std::vector & x,std::vector & src) + { + std::cout << "Vstart default (cheat) mrhs version"<Vstart(x[rhs],src[rhs]); + } + } + virtual void Vstart(Field & x,const Field & src)=0; + virtual void PcgM2(const Field & in, Field & out) { + out=in; } - virtual void M1(Field & in, Field & out) {// the smoother + virtual RealD PcgM3(const Field & p, Field & mmp){ + RealD dd; + _FineLinop.HermOp(p,mmp); + ComplexD dot = innerProduct(p,mmp); + dd=real(dot); + return dd; + } + ///////////////////////////////////////////////////////////////////// + // Only Def1 has non-trivial Vout. 
+ ///////////////////////////////////////////////////////////////////// + +}; + +template +class TwoLevelADEF2 : public TwoLevelCG +{ + public: + /////////////////////////////////////////////////////////////////////////////////// + // Need something that knows how to get from Coarse to fine and back again + // void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){ + // void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){ + /////////////////////////////////////////////////////////////////////////////////// + GridBase *coarsegrid; + Aggregation &_Aggregates; + LinearFunction &_CoarseSolver; + LinearFunction &_CoarseSolverPrecise; + /////////////////////////////////////////////////////////////////////////////////// + + // more most opertor functions + TwoLevelADEF2(RealD tol, + Integer maxit, + LinearOperatorBase &FineLinop, + LinearFunction &Smoother, + LinearFunction &CoarseSolver, + LinearFunction &CoarseSolverPrecise, + Aggregation &Aggregates + ) : + TwoLevelCG(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid), + _CoarseSolver(CoarseSolver), + _CoarseSolverPrecise(CoarseSolverPrecise), + _Aggregates(Aggregates) + { + coarsegrid = Aggregates.CoarseGrid; + }; + + virtual void PcgM1(Field & in, Field & out) + { + GRID_TRACE("MultiGridPreconditioner "); // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min] - Field tmp(grid); - Field Min(grid); - PcgM(in,Min); // Smoother call + Field tmp(this->grid); + Field Min(this->grid); + CoarseField PleftProj(this->coarsegrid); + CoarseField PleftMss_proj(this->coarsegrid); - HermOp(Min,out); + GridStopWatch SmootherTimer; + GridStopWatch MatrixTimer; + SmootherTimer.Start(); + this->_Smoother(in,Min); + SmootherTimer.Stop(); + + MatrixTimer.Start(); + this->_FineLinop.HermOp(Min,out); + MatrixTimer.Stop(); axpy(tmp,-1.0,out,in); // tmp = in - A Min - ProjectToSubspace(tmp,PleftProj); - ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s - PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min] + GridStopWatch ProjTimer; + GridStopWatch CoarseTimer; + GridStopWatch PromTimer; + ProjTimer.Start(); + this->_Aggregates.ProjectToSubspace(PleftProj,tmp); + ProjTimer.Stop(); + CoarseTimer.Start(); + this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s + CoarseTimer.Stop(); + PromTimer.Start(); + this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min] + PromTimer.Stop(); + std::cout << GridLogPerformance << "PcgM1 breakdown "<Mprec(p,mmp,tmp,0,1);// Dag no - // linop_d->Mprec(mmp,mp,tmp,1);// Dag yes - // Pleft(mp,mmp); - // d=real(linop_d->inner(p,mmp)); - } - - virtual void VstartDef2(Field & xconst Field & src){ - //case PcgDef2: - //case PcgAdef2: - //case PcgAdef2f: - //case PcgV11f: + virtual void Vstart(Field & x,const Field & src) + { + std::cout << GridLogMessage<<"HDCG: fPcg Vstart "< // = src_s - (A guess)_s - src_s + (A guess)_s // = 0 /////////////////////////////////// - Field r(grid); - Field mmp(grid); - - HermOp(x,mmp); - axpy (r, -1.0, mmp, src); // r_{-1} = src - A x - ProjectToSubspace(r,PleftProj); - ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s - PromoteFromSubspace(PleftMss_proj,mmp); - x=x+mmp; + Field r(this->grid); + Field mmp(this->grid); + CoarseField PleftProj(this->coarsegrid); + CoarseField PleftMss_proj(this->coarsegrid); + + std::cout << GridLogMessage<<"HDCG: fPcg Vstart projecting "<_Aggregates.ProjectToSubspace(PleftProj,src); + std::cout << GridLogMessage<<"HDCG: fPcg Vstart coarse solve 
"<_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s + std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<_Aggregates.PromoteFromSubspace(PleftMss_proj,x); } +}; + + +template +class TwoLevelADEF1defl : public TwoLevelCG +{ +public: + const std::vector &evec; + const std::vector &eval; + + TwoLevelADEF1defl(RealD tol, + Integer maxit, + LinearOperatorBase &FineLinop, + LinearFunction &Smoother, + std::vector &_evec, + std::vector &_eval) : + TwoLevelCG(tol,maxit,FineLinop,Smoother,_evec[0].Grid()), + evec(_evec), + eval(_eval) + {}; + + // Can just inherit existing M2 + // Can just inherit existing M3 + + // Simple vstart - do nothing virtual void Vstart(Field & x,const Field & src){ - return; + x=src; // Could apply Q + }; + + // Override PcgM1 + virtual void PcgM1(Field & in, Field & out) + { + GRID_TRACE("EvecPreconditioner "); + int N=evec.size(); + Field Pin(this->grid); + Field Qin(this->grid); + + //MP + Q = M(1-AQ) + Q = M + // // If we are eigenvector deflating in coarse space + // // Q = Sum_i |phi_i> 1/lambda_i _Smoother(Pin,out); + + out = out + Qin; } +}; - ///////////////////////////////////////////////////////////////////// - // Only Def1 has non-trivial Vout. Override in Def1 - ///////////////////////////////////////////////////////////////////// - virtual void Vout (Field & in, Field & out,Field & src){ - out = in; - //case PcgDef1: - // //Qb + PT x - // ProjectToSubspace(src,PleftProj); - // ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s - // PromoteFromSubspace(PleftMss_proj,tmp); - // - // Pright(in,out); - // - // linop_d->axpy(out,tmp,out,1.0); - // break; - } +NAMESPACE_END(Grid); - //////////////////////////////////////////////////////////////////////////////////////////////// - // Pright and Pleft are common to all implementations - //////////////////////////////////////////////////////////////////////////////////////////////// - virtual void Pright(Field & in,Field & out){ - // P_R = [ 1 0 ] - // [ -Mss^-1 Msb 0 ] - Field in_sbar(grid); - - ProjectToSubspace(in,PleftProj); - PromoteFromSubspace(PleftProj,out); - axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s - - HermOp(in_sbar,out); - ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project) - - ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar - PromoteFromSubspace(PleftMss_proj,out); // - - axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar - } - virtual void Pleft (Field & in,Field & out){ - // P_L = [ 1 -Mbs Mss^-1] - // [ 0 0 ] - Field in_sbar(grid); - Field tmp2(grid); - Field Mtmp(grid); - - ProjectToSubspace(in,PleftProj); - PromoteFromSubspace(PleftProj,out); - axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s - - ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s - PromoteFromSubspace(PleftMss_proj,out); - - HermOp(out,Mtmp); - - ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1} - PromoteFromSubspace(PleftProj,tmp2); - - axpy(out,-1.0,tmp2,Mtmp); - axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s - } -} - -template -class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg { - public: - virtual void M(Field & in,Field & out,Field & tmp){ - - } - virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){ - - } - virtual void M2(Field & in, Field & out){ - - } - virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){ - - } - virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){ - - } -} -/* -template -class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg { - public: 
- virtual void M(Field & in,Field & out,Field & tmp); - virtual void M1(Field & in, Field & out,Field & tmp,Field & mp); - virtual void M2(Field & in, Field & out); - virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp); - virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp); -} - -template -class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg { - public: - virtual void M(Field & in,Field & out,Field & tmp); - virtual void M1(Field & in, Field & out,Field & tmp,Field & mp); - virtual void M2(Field & in, Field & out); - virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp); - virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp); - virtual void Vout (Field & in, Field & out,Field & src,Field & tmp); -} - -template -class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg { - public: - virtual void M(Field & in,Field & out,Field & tmp); - virtual void M1(Field & in, Field & out,Field & tmp,Field & mp); - virtual void M2(Field & in, Field & out); - virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp); - virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp); -} - -template -class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg { - public: - virtual void M(Field & in,Field & out,Field & tmp); - virtual void M1(Field & in, Field & out,Field & tmp,Field & mp); - virtual void M2(Field & in, Field & out); - virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp); - virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp); -} -*/ #endif diff --git a/Grid/algorithms/iterative/AdefMrhs.h b/Grid/algorithms/iterative/AdefMrhs.h new file mode 100644 index 00000000..a2788138 --- /dev/null +++ b/Grid/algorithms/iterative/AdefMrhs.h @@ -0,0 +1,414 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/AdefGeneric.h + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#pragma once + + + /* + * Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv. 
+ * Script A = SolverMatrix + * Script P = Preconditioner + * + * Implement ADEF-2 + * + * Vstart = P^Tx + Qb + * M1 = P^TM + Q + * M2=M3=1 + */ +NAMESPACE_BEGIN(Grid); + + +template +class TwoLevelCGmrhs +{ + public: + RealD Tolerance; + Integer MaxIterations; + GridBase *grid; + + // Fine operator, Smoother, CoarseSolver + LinearOperatorBase &_FineLinop; + LinearFunction &_Smoother; + + GridStopWatch ProjectTimer; + GridStopWatch PromoteTimer; + GridStopWatch DeflateTimer; + GridStopWatch CoarseTimer; + GridStopWatch FineTimer; + GridStopWatch SmoothTimer; + GridStopWatch InsertTimer; + + + // more most opertor functions + TwoLevelCGmrhs(RealD tol, + Integer maxit, + LinearOperatorBase &FineLinop, + LinearFunction &Smoother, + GridBase *fine) : + Tolerance(tol), + MaxIterations(maxit), + _FineLinop(FineLinop), + _Smoother(Smoother) + { + grid = fine; + }; + + // Vector case + virtual void operator() (std::vector &src, std::vector &x) + { + std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<Barrier(); + int nrhs = src.size(); + std::vector f(nrhs); + std::vector rtzp(nrhs); + std::vector rtz(nrhs); + std::vector a(nrhs); + std::vector d(nrhs); + std::vector b(nrhs); + std::vector rptzp(nrhs); + ///////////////////////////// + // Set up history vectors + ///////////////////////////// + int mmax = 3; + + std::vector > p(nrhs); for(int r=0;r > mmp(nrhs); for(int r=0;r > pAp(nrhs); for(int r=0;r z(nrhs,grid); + std::vector mp (nrhs,grid); + std::vector r (nrhs,grid); + std::vector mu (nrhs,grid); + + //Initial residual computation & set up + std::vector src_nrm(nrhs); + for(int rhs=0;rhs tn(nrhs); + + GridStopWatch HDCGTimer; + ////////////////////////// + // x0 = Vstart -- possibly modify guess + ////////////////////////// + Vstart(x,src); + + for(int rhs=0;rhs ssq(nrhs); + std::vector rsq(nrhs); + std::vector pp(nrhs,grid); + + for(int rhs=0;rhs rn(nrhs); + for (int k=0;k<=MaxIterations;k++){ + + int peri_k = k % mmax; + int peri_kp = (k+1) % mmax; + + for(int rhs=0;rhsmmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm + for(int back=0; back < northog; back++){ + int peri_back = (k-back)%mmax; + RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp])); + RealD beta = -pbApk/pAp[rhs][peri_back]; + axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]); + } + + RealD rrn=sqrt(rn[rhs]/ssq[rhs]); + RealD rtn=sqrt(rtz[rhs]/ssq[rhs]); + RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]); + + std::cout< max_rn ) max_rn = rrn; + } + LinalgTimer.Stop(); + + // Stopping condition based on worst case + if ( max_rn <= Tolerance ) { + + HDCGTimer.Stop(); + std::cout< & in,std::vector & out) = 0; + virtual void Vstart(std::vector & x,std::vector & src) = 0; + virtual void PcgM2(const Field & in, Field & out) { + out=in; + } + + virtual RealD PcgM3(const Field & p, Field & mmp){ + RealD dd; + _FineLinop.HermOp(p,mmp); + ComplexD dot = innerProduct(p,mmp); + dd=real(dot); + return dd; + } + +}; + +template +class TwoLevelADEF2mrhs : public TwoLevelCGmrhs +{ +public: + GridBase *coarsegrid; + GridBase *coarsegridmrhs; + LinearFunction &_CoarseSolverMrhs; + LinearFunction &_CoarseSolverPreciseMrhs; + MultiRHSBlockProject &_Projector; + MultiRHSDeflation &_Deflator; + + + TwoLevelADEF2mrhs(RealD tol, + Integer maxit, + LinearOperatorBase &FineLinop, + LinearFunction &Smoother, + LinearFunction &CoarseSolverMrhs, + LinearFunction &CoarseSolverPreciseMrhs, + MultiRHSBlockProject &Projector, + MultiRHSDeflation &Deflator, + GridBase *_coarsemrhsgrid) : + TwoLevelCGmrhs(tol, 
maxit,FineLinop,Smoother,Projector.fine_grid), + _CoarseSolverMrhs(CoarseSolverMrhs), + _CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs), + _Projector(Projector), + _Deflator(Deflator) + { + coarsegrid = Projector.coarse_grid; + coarsegridmrhs = _coarsemrhsgrid;// Thi could be in projector + }; + + // Override Vstart + virtual void Vstart(std::vector & x,std::vector & src) + { + int nrhs=x.size(); + /////////////////////////////////// + // Choose x_0 such that + // x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess] + // = [1 - Ass_inv A] Guess + Assinv src + // = P^T guess + Assinv src + // = Vstart [Tang notation] + // This gives: + // W^T (src - A x_0) = src_s - A guess_s - r_s + // = src_s - (A guess)_s - src_s + (A guess)_s + // = 0 + /////////////////////////////////// + std::vector PleftProj(nrhs,this->coarsegrid); + std::vector PleftMss_proj(nrhs,this->coarsegrid); + CoarseField PleftProjMrhs(this->coarsegridmrhs); + CoarseField PleftMss_projMrhs(this->coarsegridmrhs); + + this->_Projector.blockProject(src,PleftProj); + this->_Deflator.DeflateSources(PleftProj,PleftMss_proj); + for(int rhs=0;rhs_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s + + for(int rhs=0;rhs_Projector.blockPromote(x,PleftMss_proj); + } + + virtual void PcgM1(std::vector & in,std::vector & out){ + + int nrhs=in.size(); + + // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min] + std::vector tmp(nrhs,this->grid); + std::vector Min(nrhs,this->grid); + + std::vector PleftProj(nrhs,this->coarsegrid); + std::vector PleftMss_proj(nrhs,this->coarsegrid); + + CoarseField PleftProjMrhs(this->coarsegridmrhs); + CoarseField PleftMss_projMrhs(this->coarsegridmrhs); + + for(int rhs=0;rhsSmoothTimer.Start(); + this->_Smoother(in[rhs],Min[rhs]); + this->SmoothTimer.Stop(); + + this->FineTimer.Start(); + this->_FineLinop.HermOp(Min[rhs],out[rhs]); + + axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min + this->FineTimer.Stop(); + + } + + this->ProjectTimer.Start(); + this->_Projector.blockProject(tmp,PleftProj); + this->ProjectTimer.Stop(); + this->DeflateTimer.Start(); + this->_Deflator.DeflateSources(PleftProj,PleftMss_proj); + this->DeflateTimer.Stop(); + this->InsertTimer.Start(); + for(int rhs=0;rhsInsertTimer.Stop(); + + this->CoarseTimer.Start(); + this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s + this->CoarseTimer.Stop(); + + this->InsertTimer.Start(); + for(int rhs=0;rhsInsertTimer.Stop(); + this->PromoteTimer.Start(); + this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min] + this->PromoteTimer.Stop(); + this->FineTimer.Start(); + for(int rhs=0;rhsFineTimer.Stop(); + } +}; + + +NAMESPACE_END(Grid); + + diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 3308d8fe..8b4c8fc5 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -54,11 +54,14 @@ public: ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), - ErrorOnNoConverge(err_on_no_conv){}; + ErrorOnNoConverge(err_on_no_conv) + {}; void operator()(LinearOperatorBase &Linop, const Field &src, Field &psi) { GRID_TRACE("ConjugateGradient"); + GridStopWatch PreambleTimer; + PreambleTimer.Start(); psi.Checkerboard() = src.Checkerboard(); conformable(psi, src); @@ -66,22 +69,26 @@ public: RealD cp, c, a, d, b, ssq, qq; //RealD b_pred; - Field p(src); - Field mmp(src); - Field r(src); + // Was 
doing copies + Field p(src.Grid()); + Field mmp(src.Grid()); + Field r(src.Grid()); // Initial residual computation & set up + ssq = norm2(src); RealD guess = norm2(psi); assert(std::isnan(guess) == 0); - - Linop.HermOpAndNorm(psi, mmp, d, b); - - r = src - mmp; - p = r; - - a = norm2(p); + if ( guess == 0.0 ) { + r = src; + p = r; + a = ssq; + } else { + Linop.HermOpAndNorm(psi, mmp, d, b); + r = src - mmp; + p = r; + a = norm2(p); + } cp = a; - ssq = norm2(src); // Handle trivial case of zero src if (ssq == 0.){ @@ -111,6 +118,7 @@ public: std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; + PreambleTimer.Stop(); GridStopWatch LinalgTimer; GridStopWatch InnerTimer; GridStopWatch AxpyNormTimer; @@ -183,13 +191,14 @@ public: << "\tTrue residual " << true_residual << "\tTarget " << Tolerance << std::endl; - std::cout << GridLogMessage << "Time breakdown "< +Author: Yong-Chull Jang +Author: Chulwoo Jung + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#pragma once + +#include //memset + +NAMESPACE_BEGIN(Grid); + +#define Glog std::cout << GridLogMessage + +///////////////////////////////////////////////////////////// +// Implicitly restarted block lanczos +///////////////////////////////////////////////////////////// +template +class ImplicitlyRestartedBlockLanczosCoarse { + +private: + + std::string cname = std::string("ImplicitlyRestartedBlockLanczosCoarse"); + int MaxIter; // Max iterations + int Nstop; // Number of evecs checked for convergence + int Nu; // Number of vecs in the unit block + int Nk; // Number of converged sought + int Nm; // total number of vectors + int Nblock_k; // Nk/Nu + int Nblock_m; // Nm/Nu + int Nconv_test_interval; // Number of skipped vectors when checking a convergence + RealD eresid; + IRBLdiagonalisation diagonalisation; + //////////////////////////////////// + // Embedded objects + //////////////////////////////////// + SortEigen _sort; + LinearOperatorBase &_Linop; + OperatorFunction &_poly; + GridBase * f_grid; + GridBase * mrhs_grid; + int mrhs; + ///////////////////////// + // BLAS objects + ///////////////////////// + int Nevec_acc; // Number of eigenvectors stored in the buffer evec_acc + + void VectorPoly(std::vector &in,std::vector &out) + { + Field mrhs_in(mrhs_grid); + Field mrhs_out(mrhs_grid); + for(int r=0;r= in.size()) rrr = 0; + InsertSlice(in[rrr],mrhs_in,rr,0); + } + _poly(_Linop,mrhs_in,mrhs_out); + for(int rr=0;rr &Linop, // op + GridBase * f_Grid, + GridBase * mrhs_Grid, + int _mrhs, + OperatorFunction & poly, // polynomial + int _Nstop, // really sought vecs + int _Nconv_test_interval, // conv check interval + int _Nu, // 
vecs in the unit block + int _Nk, // sought vecs + int _Nm, // total vecs + RealD _eresid, // resid in lmd deficit + int _MaxIter, // Max iterations + IRBLdiagonalisation _diagonalisation = IRBLdiagonaliseWithEigen) + : _Linop(Linop), _poly(poly),f_grid(f_Grid), mrhs_grid(mrhs_Grid), + Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs), + Nu(_Nu), Nk(_Nk), Nm(_Nm), + Nblock_m(_Nm/_Nu), Nblock_k(_Nk/_Nu), + eresid(_eresid), MaxIter(_MaxIter), + diagonalisation(_diagonalisation), + Nevec_acc(_Nu) + { assert( (Nk%Nu==0) && (Nm%Nu==0) ); }; + + //////////////////////////////// + // Helpers + //////////////////////////////// + static RealD normalize(Field& v, int if_print=0) + { + RealD nn = norm2(v); + nn = sqrt(nn); + v = v * (1.0/nn); + return nn; + } + + void orthogonalize(Field& w, std::vector& evec, int k, int if_print=0) + { + typedef typename Field::scalar_type MyComplex; + ComplexD ip; + + for(int j=0; j 1e-14) + Glog<<"orthogonalize before: "< 1e-14) + Glog<<"orthogonalize after: "<& evec, int k) + { + orthogonalize(w, evec, k,1); + } + + void orthogonalize(std::vector& w, int _Nu, std::vector& evec, int k, int if_print=0) + { + typedef typename Field::scalar_type MyComplex; + MyComplex ip; + + for(int j=0; j& evec, int k, int Nu) + { + typedef typename Field::scalar_type MyComplex; + MyComplex ip; + + for(int j=0; j& eval, + std::vector& evec, + const std::vector& src, int& Nconv, LanczosType Impl) + { + switch (Impl) { + case LanczosType::irbl: + calc_irbl(eval,evec,src,Nconv); + break; + + case LanczosType::rbl: + calc_rbl(eval,evec,src,Nconv); + break; + } + } + + void calc_irbl(std::vector& eval, + std::vector& evec, + const std::vector& src, int& Nconv) + { + std::string fname = std::string(cname+"::calc_irbl()"); + GridBase *grid = evec[0].Grid(); + assert(grid == src[0].Grid()); + assert( Nu == src.size() ); + + Glog << std::string(74,'*') << std::endl; + Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; + Glog << std::string(74,'*') << std::endl; + Glog <<" -- seek Nk = "<< Nk <<" vectors"<< std::endl; + Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl; + Glog <<" -- total Nm = "<< Nm <<" vectors"<< std::endl; + Glog <<" -- size of eval = "<< eval.size() << std::endl; + Glog <<" -- size of evec = "<< evec.size() << std::endl; + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + Glog << "Diagonalisation is Eigen "<< std::endl; +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + Glog << "Diagonalisation is LAPACK "<< std::endl; +#endif + } else { + abort(); + } + Glog << std::string(74,'*') << std::endl; + + assert(Nm == evec.size() && Nm == eval.size()); + + std::vector> lmd(Nu,std::vector(Nm,0.0)); + std::vector> lme(Nu,std::vector(Nm,0.0)); + std::vector> lmd2(Nu,std::vector(Nm,0.0)); + std::vector> lme2(Nu,std::vector(Nm,0.0)); + std::vector eval2(Nm); + std::vector resid(Nk); + + Eigen::MatrixXcd Qt = Eigen::MatrixXcd::Zero(Nm,Nm); + Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); + + std::vector Iconv(Nm); + std::vector B(Nm,grid); // waste of space replicating + + std::vector f(Nu,grid); + std::vector f_copy(Nu,grid); + Field v(grid); + + Nconv = 0; + + RealD beta_k; + + // set initial vector + for (int i=0; i& eval, + std::vector& evec, + const std::vector& src, int& Nconv) + { + std::string fname = std::string(cname+"::calc_rbl()"); + GridBase *grid = evec[0].Grid(); + assert(grid == src[0].Grid()); + assert( Nu == src.size() ); + + int Np = (Nm-Nk); + if (Np > 0 && MaxIter > 1) Np /= 
MaxIter; + int Nblock_p = Np/Nu; + for(int i=0;i< evec.size();i++) evec[0].Advise()=AdviseInfrequentUse; + + Glog << std::string(74,'*') << std::endl; + Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; + Glog << std::string(74,'*') << std::endl; + Glog <<" -- seek (min) Nk = "<< Nk <<" vectors"<< std::endl; + Glog <<" -- seek (inc) Np = "<< Np <<" vectors"<< std::endl; + Glog <<" -- seek (max) Nm = "<< Nm <<" vectors"<< std::endl; + Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl; + Glog <<" -- size of eval = "<< eval.size() << std::endl; + Glog <<" -- size of evec = "<< evec.size() << std::endl; + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + Glog << "Diagonalisation is Eigen "<< std::endl; +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + Glog << "Diagonalisation is LAPACK "<< std::endl; +#endif + } else { + abort(); + } + Glog << std::string(74,'*') << std::endl; + + assert(Nm == evec.size() && Nm == eval.size()); + + std::vector> lmd(Nu,std::vector(Nm,0.0)); + std::vector> lme(Nu,std::vector(Nm,0.0)); + std::vector> lmd2(Nu,std::vector(Nm,0.0)); + std::vector> lme2(Nu,std::vector(Nm,0.0)); + std::vector eval2(Nk); + std::vector resid(Nm); + + Eigen::MatrixXcd Qt = Eigen::MatrixXcd::Zero(Nm,Nm); + Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); + + std::vector Iconv(Nm); +// int Ntest=Nu; +// std::vector B(Nm,grid); // waste of space replicating + std::vector B(1,grid); // waste of space replicating + + std::vector f(Nu,grid); + std::vector f_copy(Nu,grid); + Field v(grid); + + Nconv = 0; + +// RealD beta_k; + + // set initial vector + for (int i=0; i Btmp(Nstop,grid); // waste of space replicating + + for(int i=0; i>& lmd, + std::vector>& lme, + std::vector& evec, + std::vector& w, + std::vector& w_copy, + int b) + { + const RealD tiny = 1.0e-20; + + int Nu = w.size(); + int Nm = evec.size(); + assert( b < Nm/Nu ); + + // converts block index to full indicies for an interval [L,R) + int L = Nu*b; + int R = Nu*(b+1); + + Real beta; + + assert((Nu%mrhs)==0); + std::vector in(mrhs,f_grid); + std::vector out(mrhs,f_grid); + + // unnecessary copy. Can or should it be avoided? 
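+      // Note (inferred from VectorPoly above): the loop below feeds the Nu
+      // single-RHS fields through the operator mrhs at a time on mrhs_grid,
+      // packing and unpacking around each batched call; that pack/unpack is
+      // the copy questioned in the comment above.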
+ int k_start = 0; + while ( k_start < Nu) { + Glog << "k_start= "<0) { + for (int u=0; u& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, // Nm x Nm + GridBase *grid) + { + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); + + for ( int u=0; u eigensolver(BlockTriDiag); + + for (int i = 0; i < Nk; i++) { + eval[Nk-1-i] = eigensolver.eigenvalues()(i); + } + for (int i = 0; i < Nk; i++) { + for (int j = 0; j < Nk; j++) { + Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i); + //Qt(Nk-1-i,j) = eigensolver.eigenvectors()(i,j); + //Qt(i,j) = eigensolver.eigenvectors()(i,j); + } + } + } + +#ifdef USE_LAPACK + void diagonalize_lapack(std::vector& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, // Nm x Nm + GridBase *grid) + { + Glog << "diagonalize_lapack: Nu= "<_Nprocessors; + int node = grid->_processor; + int interval = (NN/total)+1; + double vl = 0.0, vu = 0.0; + MKL_INT il = interval*node+1 , iu = interval*(node+1); + if (iu > NN) iu=NN; + Glog << "node "<= il-1; i--){ + evals_tmp[i] = evals_tmp[i - (il-1)]; + if (il>1) evals_tmp[i-(il-1)]=0.; + for (int j = 0; j< NN; j++){ + evec_tmp[i*NN+j] = evec_tmp[(i - (il-1))*NN+j]; + if (il>1) { + evec_tmp[(i-(il-1))*NN+j].imag=0.; + evec_tmp[(i-(il-1))*NN+j].real=0.; + } + } + } + } + { + grid->GlobalSumVector(evals_tmp,NN); + grid->GlobalSumVector((double*)evec_tmp,2*NN*NN); + } + } + for (int i = 0; i < Nk; i++) + eval[Nk-1-i] = evals_tmp[i]; + for (int i = 0; i < Nk; i++) { + for (int j = 0; j < Nk; j++) { +// Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i); + Qt(j,Nk-1-i)=std::complex + ( evec_tmp[i*Nk+j].real, + evec_tmp[i*Nk+j].imag); +// ( evec_tmp[(Nk-1-j)*Nk+Nk-1-i].real, +// evec_tmp[(Nk-1-j)*Nk+Nk-1-i].imag); + + } + } + +if (1){ + Eigen::SelfAdjointEigenSolver eigensolver(BlockTriDiag); + + for (int i = 0; i < Nk; i++) { + Glog << "eval = "<& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, + GridBase *grid) + { + Qt = Eigen::MatrixXcd::Identity(Nm,Nm); + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); +#endif + } else { + assert(0); + } + } + + + void unpackHermitBlockTriDiagMatToEigen( + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nb, int Nk, int Nm, + Eigen::MatrixXcd& M) + { + //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + M = Eigen::MatrixXcd::Zero(Nk,Nk); + + // rearrange + for ( int u=0; u>& lmd, + std::vector>& lme, + int Nu, int Nb, int Nk, int Nm, + Eigen::MatrixXcd& M) + { + //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + + // rearrange + for ( int u=0; u QRD(Mtmp); + Q = QRD.householderQ(); + R = QRD.matrixQR(); // upper triangular part is the R matrix. + // lower triangular part used to represent series + // of Q sequence. 
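+    // Note: this is the standard implicitly shifted QR update. With
+    // Mtmp = M - shift*I factored as Q R, the updated block tridiagonal
+    // matrix is Q^dag M Q = R Q + shift*I, and Qprod accumulates the
+    // product of the Q's (the total similarity transform).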
+ + // equivalent operation of Qprod *= Q + //M = Eigen::MatrixXcd::Zero(Nm,Nm); + + //for (int i=0; i Nm) kmax = Nm; + for (int k=i; ki) M(i,j) = conj(M(j,i)); + // if (i-j > Nu || j-i > Nu) M(i,j) = 0.; + // } + //} + + //Glog << "shiftedQRDecompEigen() end" << endl; + } + + void exampleQRDecompEigen(void) + { + Eigen::MatrixXd A = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd Q = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd R = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd P = Eigen::MatrixXd::Zero(3,3); + + A(0,0) = 12.0; + A(0,1) = -51.0; + A(0,2) = 4.0; + A(1,0) = 6.0; + A(1,1) = 167.0; + A(1,2) = -68.0; + A(2,0) = -4.0; + A(2,1) = 24.0; + A(2,2) = -41.0; + + Glog << "matrix A before ColPivHouseholder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Eigen::ColPivHouseholderQR QRD(A); + + Glog << "matrix A after ColPivHouseholder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence lenth = nonzeroPiviots" << std::endl; + Q = QRD.householderQ().setLength(QRD.nonzeroPivots()); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence lenth = 1" << std::endl; + Q = QRD.householderQ().setLength(1); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence lenth = 2" << std::endl; + Q = QRD.householderQ().setLength(2); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "matrixR" << std::endl; + R = QRD.matrixR(); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "R[" << i << "," << j << "] = " << R(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "rank = " << QRD.rank() << std::endl; + Glog << "threshold = " << QRD.threshold() << std::endl; + + Glog << "matrixP" << std::endl; + P = QRD.colsPermutation(); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "P[" << i << "," << j << "] = " << P(i,j) << '\n'; + } + } + Glog << std::endl; + + + Glog << "QR decomposition without column pivoting" << std::endl; + + A(0,0) = 12.0; + A(0,1) = -51.0; + A(0,2) = 4.0; + A(1,0) = 6.0; + A(1,1) = 167.0; + A(1,2) = -68.0; + A(2,0) = -4.0; + A(2,1) = 24.0; + A(2,2) = -41.0; + + Glog << "matrix A before Householder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Eigen::HouseholderQR QRDplain(A); + + Glog << "HouseholderQ" << std::endl; + Q = QRDplain.householderQ(); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "matrix A after Householder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + } + +}; + +NAMESPACE_END(Grid); +#undef Glog +#undef USE_LAPACK + diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index 
d39701f4..37f36dda 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -79,14 +79,16 @@ template class ImplicitlyRestartedLanczosHermOpTester : public Imp RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0); std::cout.precision(13); - std::cout<& evec, Field& w,int Nm,int k) { - std::cout<0) w -= lme[k-1] * evec[k-1]; @@ -480,18 +482,18 @@ until convergence lme[k] = beta; if ( (k>0) && ( (k % orth_period) == 0 )) { - std::cout<& lmd, std::vector& lme, diff --git a/Grid/algorithms/iterative/NormalEquations.h b/Grid/algorithms/iterative/NormalEquations.h index c9c92777..df90bee7 100644 --- a/Grid/algorithms/iterative/NormalEquations.h +++ b/Grid/algorithms/iterative/NormalEquations.h @@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////////////////////////////////////////// // Take a matrix and form an NE solver calling a Herm solver /////////////////////////////////////////////////////////////////////////////////////////////////////// -template class NormalEquations { +template class NormalEquations : public LinearFunction{ private: SparseMatrixBase & _Matrix; OperatorFunction & _HermitianSolver; @@ -60,7 +60,7 @@ public: } }; -template class HPDSolver { +template class HPDSolver : public LinearFunction { private: LinearOperatorBase & _Matrix; OperatorFunction & _HermitianSolver; @@ -78,13 +78,13 @@ public: void operator() (const Field &in, Field &out){ _Guess(in,out); - _HermitianSolver(_Matrix,in,out); // Mdag M out = Mdag in + _HermitianSolver(_Matrix,in,out); //M out = in } }; -template class MdagMSolver { +template class MdagMSolver : public LinearFunction { private: SparseMatrixBase & _Matrix; OperatorFunction & _HermitianSolver; diff --git a/Grid/algorithms/iterative/PowerMethod.h b/Grid/algorithms/iterative/PowerMethod.h index 027ea68c..5c60993d 100644 --- a/Grid/algorithms/iterative/PowerMethod.h +++ b/Grid/algorithms/iterative/PowerMethod.h @@ -20,7 +20,7 @@ template class PowerMethod RealD evalMaxApprox = 0.0; auto src_n = src; auto tmp = src; - const int _MAX_ITER_EST_ = 50; + const int _MAX_ITER_EST_ = 100; for (int i=0;i<_MAX_ITER_EST_;i++) { diff --git a/Grid/algorithms/multigrid/Aggregates.h b/Grid/algorithms/multigrid/Aggregates.h new file mode 100644 index 00000000..404ff27b --- /dev/null +++ b/Grid/algorithms/multigrid/Aggregates.h @@ -0,0 +1,478 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/Aggregates.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +inline RealD AggregatePowerLaw(RealD x) +{ + // return std::pow(x,-4); + // return std::pow(x,-3); + return std::pow(x,-5); +} + +template +class Aggregation { +public: + constexpr int Nbasis(void) { return nbasis; }; + + typedef iVector siteVector; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + GridBase *CoarseGrid; + GridBase *FineGrid; + std::vector > subspace; + int checkerboard; + int Checkerboard(void){return checkerboard;} + Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : + CoarseGrid(_CoarseGrid), + FineGrid(_FineGrid), + subspace(nbasis,_FineGrid), + checkerboard(_checkerboard) + { + }; + + + void Orthogonalise(void){ + CoarseScalar InnerProd(CoarseGrid); + // std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"< &hermop,int nn=nbasis) + { + + RealD scale; + + ConjugateGradient CG(1.0e-2,100,false); + FineField noise(FineGrid); + FineField Mn(FineGrid); + + for(int b=0;b "< "< &hermop, + int nn, + double hi, + double lo, + int orderfilter, + int ordermin, + int orderstep, + double filterlo + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + + // New normalised noise + gaussian(RNG,noise); + scale = std::pow(norm2(noise),-0.5); + noise=noise*scale; + + std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "< "< Cheb(lo,hi,orderfilter); + Cheb(hermop,noise,Mn); + // normalise + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "<oSites(), Nsimd, { + coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); + coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); + }); + + // Possible more fine grained control is needed than a linear sweep, + // but huge productivity gain if this is simple algorithm and not a tunable + int m =1; + if ( n>=ordermin ) m=n-ordermin; + if ( (m%orderstep)==0 ) { + Mn=*Tnp; + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< &hermop, + int nn, + double hi, + double lo, + int orderfilter + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + + // New normalised noise + std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "< "< Cheb(lo,hi,orderfilter); + Cheb(hermop,noise,Mn); + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + + // Refine + Chebyshev PowerLaw(lo,hi,1000,AggregatePowerLaw); + noise = Mn; + PowerLaw(hermop,noise,Mn); + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + + // normalise + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< &hermop, + int nn, + double hi, + int orderfilter + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + + // New normalised noise + std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "< "< Cheb(0.0,hi,orderfilter,AggregatePowerLaw); + Cheb(hermop,noise,Mn); + // normalise + scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; + subspace[b] = Mn; + hermop.Op(Mn,tmp); + std::cout< "< &hermop, + double hi + ) { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + + // New 
normalised noise + for(int b =0;b "< Cheb1(3.0,hi,300); + Chebyshev Cheb2(1.0,hi,50); + Chebyshev Cheb3(0.5,hi,300); + Chebyshev Cheb4(0.05,hi,500); + Chebyshev Cheb5(0.01,hi,2000); + */ + /* 242 */ + /* + Chebyshev Cheb3(0.1,hi,300); + Chebyshev Cheb2(0.02,hi,1000); + Chebyshev Cheb1(0.003,hi,2000); + 8? + */ + /* How many?? + */ + Chebyshev Cheb2(0.001,hi,2500); // 169 iters on HDCG after refine + Chebyshev Cheb1(0.02,hi,600); + + // Chebyshev Cheb2(0.001,hi,1500); + // Chebyshev Cheb1(0.02,hi,600); + Cheb1(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale; + hermop.Op(noise,tmp); std::cout< "< "< "< "< "< "< &hermop, + double Lo,double tol,int maxit) + { + + RealD scale; + + FineField noise(FineGrid); + FineField Mn(FineGrid); + FineField tmp(FineGrid); + + // New normalised noise + std::cout << GridLogMessage<<" Multishift subspace : Lo "< alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0}); + std::vector shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon}); + std::vector tols({tol,tol,tol,tol}); + std::cout << "sizes "< MSCG(maxit,msf); + + for(int b =0;b "< "< &hermop, + double Lo,double tol,int maxit) + { + FineField tmp(FineGrid); + for(int b =0;b CGsloppy(tol,maxit,false); + ShiftedHermOpLinearOperator ShiftedFineHermOp(hermop,Lo); + tmp=Zero(); + CGsloppy(hermop,subspace[b],tmp); + RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale; + subspace[b]=tmp; + hermop.Op(subspace[b],tmp); + std::cout< "< &hermop, + TwoLevelADEF2mrhs & theHDCG, + int nrhs) + { + std::vector src_mrhs(nrhs,FineGrid); + std::vector res_mrhs(nrhs,FineGrid); + FineField tmp(FineGrid); + for(int b =0;b "< "< &CoarseInner, blockSum(CoarseInner,fine_inner_msk); } - -class Geometry { -public: - int npoint; - int base; - std::vector directions ; - std::vector displacements; - std::vector points_dagger; - - Geometry(int _d) { - - base = (_d==5) ? 
1:0; - - // make coarse grid stencil for 4d , not 5d - if ( _d==5 ) _d=4; - - npoint = 2*_d+1; - directions.resize(npoint); - displacements.resize(npoint); - points_dagger.resize(npoint); - for(int d=0;d<_d;d++){ - directions[d ] = d+base; - directions[d+_d] = d+base; - displacements[d ] = +1; - displacements[d+_d]= -1; - points_dagger[d ] = d+_d; - points_dagger[d+_d] = d; - } - directions [2*_d]=0; - displacements[2*_d]=0; - points_dagger[2*_d]=2*_d; - } - - int point(int dir, int disp) { - assert(disp == -1 || disp == 0 || disp == 1); - assert(base+0 <= dir && dir < base+4); - - // directions faster index = new indexing - // 4d (base = 0): - // point 0 1 2 3 4 5 6 7 8 - // dir 0 1 2 3 0 1 2 3 0 - // disp +1 +1 +1 +1 -1 -1 -1 -1 0 - // 5d (base = 1): - // point 0 1 2 3 4 5 6 7 8 - // dir 1 2 3 4 1 2 3 4 0 - // disp +1 +1 +1 +1 -1 -1 -1 -1 0 - - // displacements faster index = old indexing - // 4d (base = 0): - // point 0 1 2 3 4 5 6 7 8 - // dir 0 0 1 1 2 2 3 3 0 - // disp +1 -1 +1 -1 +1 -1 +1 -1 0 - // 5d (base = 1): - // point 0 1 2 3 4 5 6 7 8 - // dir 1 1 2 2 3 3 4 4 0 - // disp +1 -1 +1 -1 +1 -1 +1 -1 0 - - if(dir == 0 and disp == 0) - return 8; - else // New indexing - return (1 - disp) / 2 * 4 + dir - base; - // else // Old indexing - // return (4 * (dir - base) + 1 - disp) / 2; - } -}; - -template -class Aggregation { -public: - typedef iVector siteVector; - typedef Lattice CoarseVector; - typedef Lattice > CoarseMatrix; - - typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field - typedef Lattice FineField; - - GridBase *CoarseGrid; - GridBase *FineGrid; - std::vector > subspace; - int checkerboard; - int Checkerboard(void){return checkerboard;} - Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : - CoarseGrid(_CoarseGrid), - FineGrid(_FineGrid), - subspace(nbasis,_FineGrid), - checkerboard(_checkerboard) - { - }; - - void Orthogonalise(void){ - CoarseScalar InnerProd(CoarseGrid); - std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"< &hermop,int nn=nbasis) { - - RealD scale; - - ConjugateGradient CG(1.0e-2,100,false); - FineField noise(FineGrid); - FineField Mn(FineGrid); - - for(int b=0;b "< "< &hermop, - int nn, - double hi, - double lo, - int orderfilter, - int ordermin, - int orderstep, - double filterlo - ) { - - RealD scale; - - FineField noise(FineGrid); - FineField Mn(FineGrid); - FineField tmp(FineGrid); - - // New normalised noise - gaussian(RNG,noise); - scale = std::pow(norm2(noise),-0.5); - noise=noise*scale; - - // Initial matrix element - hermop.Op(noise,Mn); std::cout< "< Cheb(lo,hi,orderfilter); - Cheb(hermop,noise,Mn); - // normalise - scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; - subspace[b] = Mn; - hermop.Op(Mn,tmp); - std::cout< "<oSites(), Nsimd, { - coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); - coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); - }); - - // Possible more fine grained control is needed than a linear sweep, - // but huge productivity gain if this is simple algorithm and not a tunable - int m =1; - if ( n>=ordermin ) m=n-ordermin; - if ( (m%orderstep)==0 ) { - Mn=*Tnp; - scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; - subspace[b] = Mn; - hermop.Op(Mn,tmp); - std::cout< "< diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h new file mode 100644 index 00000000..a4098316 --- /dev/null +++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h @@ -0,0 +1,619 @@ 
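The new file below realises the coarse operator as a general (not merely nearest-neighbour) stencil: at every coarse site the action is a sum over stencil points of a dense nbasis x nbasis matrix applied to the neighbour vector at that shift. A minimal single-site reference of the contraction, in plain C++ containers rather than Grid lattice types; all names here are illustrative only, not part of the patch:

#include <complex>
#include <vector>

// Reference for the per-site arithmetic of the Mult kernel in this file:
//   out(x) = sum_p A_p(x) . in(x + delta_p)
// where delta_p runs over the shifts supplied by NonLocalStencilGeometry.
using Cplx = std::complex<double>;

void coarseSiteMult(const std::vector<std::vector<Cplx>> &A,   // [npoint][nbasis*nbasis], row major
                    const std::vector<std::vector<Cplx>> &nbr, // [npoint][nbasis] = in(x+delta_p)
                    std::vector<Cplx> &out)                    // [nbasis]
{
  const int nbasis = static_cast<int>(out.size());
  for (int i = 0; i < nbasis; i++) out[i] = Cplx(0.0);
  for (std::size_t p = 0; p < A.size(); p++)   // loop over stencil points
    for (int i = 0; i < nbasis; i++)
      for (int j = 0; j < nbasis; j++)
        out[i] += A[p][i*nbasis + j] * nbr[p][j];
}

The accelerated Mult below performs this same contraction for all sites at once, fusing the site, stencil-point and basis-row loops into a single accelerator_for and fetching neighbours through the GeneralLocalStencil of the padded cell.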
+/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +#include // needed for Dagger(Yes|No), Inverse(Yes|No) + +#include +#include + +NAMESPACE_BEGIN(Grid); + +// Fine Object == (per site) type of fine field +// nbasis == number of deflation vectors +template +class GeneralCoarsenedMatrix : public SparseMatrixBase > > { +public: + + typedef GeneralCoarsenedMatrix GeneralCoarseOp; + typedef iVector siteVector; + typedef iMatrix siteMatrix; + typedef Lattice > CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef iVector Cvec; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + typedef Lattice FineComplexField; + typedef CoarseVector Field; + //////////////////// + // Data members + //////////////////// + int hermitian; + GridBase * _FineGrid; + GridCartesian * _CoarseGrid; + NonLocalStencilGeometry &geom; + PaddedCell Cell; + GeneralLocalStencil Stencil; + + std::vector _A; + std::vector _Adag; + std::vector MultTemporaries; + + /////////////////////// + // Interface + /////////////////////// + GridBase * Grid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know + GridBase * FineGrid(void) { return _FineGrid; }; // this is all the linalg routines need to know + GridCartesian * CoarseGrid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know + + /* void ShiftMatrix(RealD shift) + { + int Nd=_FineGrid->Nd(); + Coordinate zero_shift(Nd,0); + for(int p=0;p &A,const CoarseVector &in, CoarseVector &out) + { + RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0; + RealD tmult2=0; + + ttot=-usecond(); + conformable(CoarseGrid(),in.Grid()); + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + CoarseVector tin=in; + + texch-=usecond(); + CoarseVector pin = Cell.ExchangePeriodic(tin); + texch+=usecond(); + + CoarseVector pout(pin.Grid()); + + int npoint = geom.npoint; + typedef LatticeView Aview; + typedef LatticeView Vview; + + const int Nsimd = CComplex::Nsimd(); + + int64_t osites=pin.Grid()->oSites(); + + RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); + RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint + + 2.0*osites*sizeof(siteVector)*npoint; + + { + tviews-=usecond(); + autoView( in_v , pin, AcceleratorRead); + autoView( 
out_v , pout, AcceleratorWriteDiscard); + autoView( Stencil_v , Stencil, AcceleratorRead); + tviews+=usecond(); + + // Static and prereserve to keep UVM region live and not resized across multiple calls + ttemps-=usecond(); + MultTemporaries.resize(npoint,pin.Grid()); + ttemps+=usecond(); + std::vector AcceleratorViewContainer_h; + std::vector AcceleratorVecViewContainer_h; + + tviews-=usecond(); + for(int p=0;p AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint); + static deviceVector AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); + + auto Aview_p = &AcceleratorViewContainer[0]; + auto Vview_p = &AcceleratorVecViewContainer[0]; + tcopy-=usecond(); + acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview)); + acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview)); + tcopy+=usecond(); + + tmult-=usecond(); + accelerator_for(spb, osites*nbasis*npoint, Nsimd, { + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int32_t ss = spb/(nbasis*npoint); + int32_t bp = spb%(nbasis*npoint); + int32_t point= bp/nbasis; + int32_t b = bp%nbasis; + auto SE = Stencil_v.GetEntry(point,ss); + auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd); + auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0); + for(int bb=1;bbgSites() ;bidx++){ + Coordinate bcoor; + CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor); + + for(int p=0;pGlobalDimensions()[mu]; + scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic + } + // Flip to poke/peekLocalSite and not too bad + auto link = peekSite(_A[p],scoor); + int pp = geom.Reverse(p); + pokeSite(adj(link),_Adag[pp],bcoor); + } + } + } + ///////////////////////////////////////////////////////////// + // + // A) Only reduced flops option is to use a padded cell of depth 4 + // and apply MpcDagMpc in the padded cell. + // + // Makes for ONE application of MpcDagMpc per vector instead of 30 or 80. + // With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio + // Cost is 81x more, same as stencil size. + // + // But: can eliminate comms and do as local dirichlet. + // + // Local exchange gauge field once. + // Apply to all vectors, local only computation. + // Must exchange ghost subcells in reverse process of PaddedCell to take inner products + // + // B) Can reduce cost: pad by 1, apply Deo (4^4+6^4+8^4+8^4 )/ (4x 4^4) + // pad by 2, apply Doe + // pad by 3, apply Deo + // then break out 8x directions; cost is ~10x MpcDagMpc per vector + // + // => almost factor of 10 in setup cost, excluding data rearrangement + // + // Intermediates -- ignore the corner terms, leave approximate and force Hermitian + // Intermediates -- pad by 2 and apply 1+8+24 = 33 times. + ///////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////// + // BFM HDCG style approach: Solve a system of equations to get Aij + ////////////////////////////////////////////////////////// + /* + * Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM. + * + * conj(phases[block]) proj[k][ block*Nvec+j ] = \sum_ball e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} > + * = \sum_ball e^{iqk.delta} A_ji + * + * Must invert matrix M_k,l = e^[i q_k . delta_l] + * + * Where q_k = delta_k . 
(2*M_PI/global_nb[mu]) + */ +#if 0 + void CoarsenOperator(LinearOperatorBase > &linop, + Aggregation & Subspace) + { + std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl; + GridBase *grid = FineGrid(); + + RealD tproj=0.0; + RealD teigen=0.0; + RealD tmat=0.0; + RealD tphase=0.0; + RealD tinv=0.0; + + ///////////////////////////////////////////////////////////// + // Orthogonalise the subblocks over the basis + ///////////////////////////////////////////////////////////// + CoarseScalar InnerProd(CoarseGrid()); + blockOrthogonalise(InnerProd,Subspace.subspace); + + const int npoint = geom.npoint; + + Coordinate clatt = CoarseGrid()->GlobalDimensions(); + int Nd = CoarseGrid()->Nd(); + + /* + * Here, k,l index which possible momentum/shift within the N-points connected by MdagM. + * Matrix index i is mapped to this shift via + * geom.shifts[i] + * + * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] + * = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > + * = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l} + * = M_{kl} A_ji^{b.b+l} + * + * Must assemble and invert matrix M_k,l = e^[i q_k . delta_l] + * + * Where q_k = delta_k . (2*M_PI/global_nb[mu]) + * + * Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j} + */ + teigen-=usecond(); + Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint); + Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint); + ComplexD ci(0.0,1.0); + for(int k=0;k ComputeProj(npoint,CoarseGrid()); + std::vector FT(npoint,CoarseGrid()); + for(int i=0;ioSites(); + autoView( A_v , _A[k], AcceleratorWrite); + autoView( FT_v , FT[k], AcceleratorRead); + accelerator_for(sss, osites, 1, { + for(int j=0;j > &linop, + Aggregation & Subspace) + { + std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl; + GridBase *grid = FineGrid(); + + RealD tproj=0.0; + RealD teigen=0.0; + RealD tmat=0.0; + RealD tphase=0.0; + RealD tphaseBZ=0.0; + RealD tinv=0.0; + + ///////////////////////////////////////////////////////////// + // Orthogonalise the subblocks over the basis + ///////////////////////////////////////////////////////////// + CoarseScalar InnerProd(CoarseGrid()); + blockOrthogonalise(InnerProd,Subspace.subspace); + + // for(int s=0;sGlobalDimensions(); + int Nd = CoarseGrid()->Nd(); + + /* + * Here, k,l index which possible momentum/shift within the N-points connected by MdagM. + * Matrix index i is mapped to this shift via + * geom.shifts[i] + * + * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] + * = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > + * = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l} + * = M_{kl} A_ji^{b.b+l} + * + * Must assemble and invert matrix M_k,l = e^[i q_k . delta_l] + * + * Where q_k = delta_k . 
(2*M_PI/global_nb[mu]) + * + * Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j} + */ + teigen-=usecond(); + Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint); + Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint); + ComplexD ci(0.0,1.0); + for(int k=0;k phaF(npoint,grid); + std::vector pha(npoint,CoarseGrid()); + + CoarseVector coarseInner(CoarseGrid()); + + typedef typename CComplex::scalar_type SComplex; + FineComplexField one(grid); one=SComplex(1.0); + FineComplexField zz(grid); zz = Zero(); + tphase=-usecond(); + for(int p=0;p ComputeProj(npoint,CoarseGrid()); + std::vector FT(npoint,CoarseGrid()); + for(int i=0;ioSites(); + autoView( A_v , _A[k], AcceleratorWrite); + autoView( FT_v , FT[k], AcceleratorRead); + accelerator_for(sss, osites, 1, { + for(int j=0;j &out){assert(0);}; +}; + + + +NAMESPACE_END(Grid); diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h new file mode 100644 index 00000000..98f4b22c --- /dev/null +++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h @@ -0,0 +1,729 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + + +NAMESPACE_BEGIN(Grid); + + +// Fine Object == (per site) type of fine field +// nbasis == number of deflation vectors +template +class MultiGeneralCoarsenedMatrix : public SparseMatrixBase > > { +public: + typedef typename CComplex::scalar_object SComplex; + typedef GeneralCoarsenedMatrix GeneralCoarseOp; + typedef MultiGeneralCoarsenedMatrix MultiGeneralCoarseOp; + + typedef iVector siteVector; + typedef iMatrix siteMatrix; + typedef iVector calcVector; + typedef iMatrix calcMatrix; + typedef Lattice > CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef iVector Cvec; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + typedef Lattice FineComplexField; + typedef CoarseVector Field; + + //////////////////// + // Data members + //////////////////// + GridCartesian * _CoarseGridMulti; + NonLocalStencilGeometry geom; + NonLocalStencilGeometry geom_srhs; + PaddedCell Cell; + GeneralLocalStencil Stencil; + + deviceVector BLAS_B; + deviceVector BLAS_C; + std::vector > BLAS_A; + + std::vector > BLAS_AP; + std::vector > BLAS_BP; + deviceVector BLAS_CP; + + /////////////////////// + // Interface + /////////////////////// + GridBase * Grid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know + GridCartesian * CoarseGrid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know + + // Can be used to do I/O on the operator matrices externally + void SetMatrix (int p,CoarseMatrix & A) + { + assert(A.size()==geom_srhs.npoint); + GridtoBLAS(A[p],BLAS_A[p]); + } + void GetMatrix (int p,CoarseMatrix & A) + { + assert(A.size()==geom_srhs.npoint); + BLAStoGrid(A[p],BLAS_A[p]); + } + void CopyMatrix (GeneralCoarseOp &_Op) + { + for(int p=0;plSites(); + int32_t unpadded_sites = CoarseGridMulti->lSites(); + + int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS + int32_t orhs = nrhs/CComplex::Nsimd(); + + padded_sites = padded_sites/nrhs; + unpadded_sites = unpadded_sites/nrhs; + + ///////////////////////////////////////////////// + // Device data vector storage + ///////////////////////////////////////////////// + BLAS_A.resize(geom.npoint); + for(int p=0;p lSite + assert(nbr void GridtoBLAS(const Lattice &from,deviceVector &to) + { + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *Fg = from.Grid(); + assert(!Fg->_isCheckerBoarded); + int nd = Fg->_ndimension; + + to.resize(Fg->lSites()); + + Coordinate LocalLatt = Fg->LocalDimensions(); + size_t nsite = 1; + for(int i=0;i_ostride; + Coordinate f_istride = Fg->_istride; + Coordinate f_rdimensions = Fg->_rdimensions; + + autoView(from_v,from,AcceleratorRead); + auto to_v = &to[0]; + + const int words=sizeof(vobj)/sizeof(vector_type); + accelerator_for(idx,nsite,1,{ + + Coordinate from_coor, base; + Lexicographic::CoorFromIndex(base,idx,LocalLatt); + for(int i=0;i void BLAStoGrid(Lattice &grid,deviceVector &in) + { + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *Tg = grid.Grid(); + assert(!Tg->_isCheckerBoarded); + int nd = Tg->_ndimension; + + 
assert(in.size()==Tg->lSites()); + + Coordinate LocalLatt = Tg->LocalDimensions(); + size_t nsite = 1; + for(int i=0;i_ostride; + Coordinate t_istride = Tg->_istride; + Coordinate t_rdimensions = Tg->_rdimensions; + + autoView(to_v,grid,AcceleratorWrite); + auto from_v = &in[0]; + + const int words=sizeof(vobj)/sizeof(vector_type); + accelerator_for(idx,nsite,1,{ + + Coordinate to_coor, base; + Lexicographic::CoorFromIndex(base,idx,LocalLatt); + for(int i=0;i > &linop, + Aggregation & Subspace, + GridBase *CoarseGrid) + { +#if 0 + std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl; + + GridBase *grid = Subspace.FineGrid; + + ///////////////////////////////////////////////////////////// + // Orthogonalise the subblocks over the basis + ///////////////////////////////////////////////////////////// + CoarseScalar InnerProd(CoarseGrid); + blockOrthogonalise(InnerProd,Subspace.subspace); + + const int npoint = geom_srhs.npoint; + + Coordinate clatt = CoarseGrid->GlobalDimensions(); + int Nd = CoarseGrid->Nd(); + /* + * Here, k,l index which possible momentum/shift within the N-points connected by MdagM. + * Matrix index i is mapped to this shift via + * geom.shifts[i] + * + * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] + * = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > + * = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l} + * = M_{kl} A_ji^{b.b+l} + * + * Must assemble and invert matrix M_k,l = e^[i q_k . delta_l] + * + * Where q_k = delta_k . (2*M_PI/global_nb[mu]) + * + * Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j} + */ + Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint); + Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint); + ComplexD ci(0.0,1.0); + for(int k=0;k phaF(npoint,grid); + std::vector pha(npoint,CoarseGrid); + + CoarseVector coarseInner(CoarseGrid); + + typedef typename CComplex::scalar_type SComplex; + FineComplexField one(grid); one=SComplex(1.0); + FineComplexField zz(grid); zz = Zero(); + for(int p=0;p _A; + _A.resize(geom_srhs.npoint,CoarseGrid); + + std::vector ComputeProj(npoint,CoarseGrid); + CoarseVector FT(CoarseGrid); + for(int i=0;ioSites(); + autoView( A_v , _A[k], AcceleratorWrite); + autoView( FT_v , FT, AcceleratorRead); + accelerator_for(sss, osites, 1, { + for(int j=0;j > Projector; + Projector.Allocate(nbasis,grid,CoarseGrid); + Projector.ImportBasis(Subspace.subspace); + + const int npoint = geom_srhs.npoint; + + Coordinate clatt = CoarseGrid->GlobalDimensions(); + int Nd = CoarseGrid->Nd(); + /* + * Here, k,l index which possible momentum/shift within the N-points connected by MdagM. + * Matrix index i is mapped to this shift via + * geom.shifts[i] + * + * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] + * = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > + * = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l} + * = M_{kl} A_ji^{b.b+l} + * + * Must assemble and invert matrix M_k,l = e^[i q_k . delta_l] + * + * Where q_k = delta_k . 
(2*M_PI/global_nb[mu]) + * + * Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j} + */ + Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint); + Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint); + ComplexD ci(0.0,1.0); + for(int k=0;k phaF(npoint,grid); + std::vector pha(npoint,CoarseGrid); + + CoarseVector coarseInner(CoarseGrid); + + tphase=-usecond(); + typedef typename CComplex::scalar_type SComplex; + FineComplexField one(grid); one=SComplex(1.0); + FineComplexField zz(grid); zz = Zero(); + for(int p=0;p _A; + _A.resize(geom_srhs.npoint,CoarseGrid); + + // Count use small chunks than npoint == 81 and save memory + int batch = 9; + std::vector _MphaV(batch,grid); + std::vector TmpProj(batch,CoarseGrid); + + std::vector ComputeProj(npoint,CoarseGrid); + CoarseVector FT(CoarseGrid); + for(int i=0;ioSites(); + autoView( A_v , _A[k], AcceleratorWrite); + autoView( FT_v , FT, AcceleratorRead); + accelerator_for(sss, osites, 1, { + for(int j=0;jM(in,out); + } + void M (const CoarseVector &in, CoarseVector &out) + { + // std::cout << GridLogMessage << "New Mrhs coarse"< Vview; + + const int Nsimd = CComplex::Nsimd(); + + int64_t nrhs =pin.Grid()->GlobalDimensions()[0]; + assert(nrhs>=1); + + RealD flops,bytes; + int64_t osites=in.Grid()->oSites(); // unpadded + int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs; + + flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); + bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] + + 2.0*osites*sizeof(siteVector)*npoint; + + + t_GtoB=-usecond(); + GridtoBLAS(pin,BLAS_B); + t_GtoB+=usecond(); + + GridBLAS BLAS; + + t_mult=-usecond(); + for(int p=0;p &out){assert(0);}; +}; + +NAMESPACE_END(Grid); diff --git a/Grid/algorithms/multigrid/Geometry.h b/Grid/algorithms/multigrid/Geometry.h new file mode 100644 index 00000000..e239484a --- /dev/null +++ b/Grid/algorithms/multigrid/Geometry.h @@ -0,0 +1,238 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + + +///////////////////////////////////////////////////////////////// +// Geometry class in cartesian case +///////////////////////////////////////////////////////////////// + +class Geometry { +public: + int npoint; + int base; + std::vector directions ; + std::vector displacements; + std::vector points_dagger; + + Geometry(int _d) { + + base = (_d==5) ? 
1:0; + + // make coarse grid stencil for 4d , not 5d + if ( _d==5 ) _d=4; + + npoint = 2*_d+1; + directions.resize(npoint); + displacements.resize(npoint); + points_dagger.resize(npoint); + for(int d=0;d<_d;d++){ + directions[d ] = d+base; + directions[d+_d] = d+base; + displacements[d ] = +1; + displacements[d+_d]= -1; + points_dagger[d ] = d+_d; + points_dagger[d+_d] = d; + } + directions [2*_d]=0; + displacements[2*_d]=0; + points_dagger[2*_d]=2*_d; + } + + int point(int dir, int disp) { + assert(disp == -1 || disp == 0 || disp == 1); + assert(base+0 <= dir && dir < base+4); + + // directions faster index = new indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 1 2 3 0 1 2 3 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 2 3 4 1 2 3 4 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + + // displacements faster index = old indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 0 1 1 2 2 3 3 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 1 2 2 3 3 4 4 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + + if(dir == 0 and disp == 0) + return 8; + else // New indexing + return (1 - disp) / 2 * 4 + dir - base; + // else // Old indexing + // return (4 * (dir - base) + 1 - disp) / 2; + } +}; + +///////////////////////////////////////////////////////////////// +// Less local equivalent of Geometry class in cartesian case +///////////////////////////////////////////////////////////////// +class NonLocalStencilGeometry { +public: + // int depth; + int skip; + int hops; + int npoint; + std::vector shifts; + Coordinate stencil_size; + Coordinate stencil_lo; + Coordinate stencil_hi; + GridCartesian *grid; + GridCartesian *Grid() {return grid;}; + int Depth(void){return 1;}; // Ghost zone depth + int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil + int DimSkip(void){return skip;}; + + virtual ~NonLocalStencilGeometry() {}; + + int Reverse(int point) + { + int Nd = Grid()->Nd(); + Coordinate shft = shifts[point]; + Coordinate rev(Nd); + for(int mu=0;mushifts.resize(0); + int Nd = this->grid->Nd(); + + int dd = this->DimSkip(); + for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){ + for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){ + for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){ + for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){ + Coordinate sft(Nd,0); + sft[dd+0] = s0; + sft[dd+1] = s1; + sft[dd+2] = s2; + sft[dd+3] = s3; + int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3); + if(nhops<=this->hops) this->shifts.push_back(sft); + }}}} + this->npoint = this->shifts.size(); + std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<GlobalDimensions(); + stencil_size.resize(grid->Nd()); + stencil_lo.resize(grid->Nd()); + stencil_hi.resize(grid->Nd()); + for(int d=0;dNd();d++){ + if ( latt[d] == 1 ) { + stencil_lo[d] = 0; + stencil_hi[d] = 0; + stencil_size[d]= 1; + } else if ( latt[d] == 2 ) { + stencil_lo[d] = -1; + stencil_hi[d] = 0; + stencil_size[d]= 2; + } else if ( latt[d] > 2 ) { + stencil_lo[d] = -1; + stencil_hi[d] = 1; + stencil_size[d]= 3; + } + } + this->BuildShifts(); + }; + +}; + +// Need to worry about red-black now +class NonLocalStencilGeometry4D : public NonLocalStencilGeometry { +public: + virtual int DerivedDimSkip(void) { return 0;}; + NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,0) { 
}; + virtual ~NonLocalStencilGeometry4D() {}; +}; +class NonLocalStencilGeometry5D : public NonLocalStencilGeometry { +public: + virtual int DerivedDimSkip(void) { return 1; }; + NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,1) { }; + virtual ~NonLocalStencilGeometry5D() {}; +}; +/* + * Bunch of different options classes + */ +class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D { +public: + NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,4) + { + }; +}; +class NextToNextToNextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D { +public: + NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,4) + { + }; +}; +class NextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D { +public: + NextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,2) + { + }; +}; +class NextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D { +public: + NextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,2) + { + }; +}; +class NearestStencilGeometry4D : public NonLocalStencilGeometry4D { +public: + NearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,1) + { + }; +}; +class NearestStencilGeometry5D : public NonLocalStencilGeometry5D { +public: + NearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,1) + { + }; +}; + +NAMESPACE_END(Grid); diff --git a/Grid/algorithms/multigrid/MultiGrid.h b/Grid/algorithms/multigrid/MultiGrid.h new file mode 100644 index 00000000..c7e67f39 --- /dev/null +++ b/Grid/algorithms/multigrid/MultiGrid.h @@ -0,0 +1,34 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Grid/algorithms/multigrid/MultiGrid.h + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#pragma once + +#include +#include +#include +#include +#include diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index ac787016..8a27f527 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -175,9 +175,56 @@ template using cshiftAllocator = std::allocator; template using Vector = std::vector >; template using stencilVector = std::vector >; -template using commVector = std::vector >; +template using commVector = std::vector >; template using deviceVector = std::vector >; -template using cshiftVector = std::vector >; +template using cshiftVector = std::vector >; + +/* +template class vecView +{ + protected: + T * data; + uint64_t size; + ViewMode mode; + void * cpu_ptr; + public: + accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; + vecView(std::vector &refer_to_me,ViewMode _mode) + { + cpu_ptr = &refer_to_me[0]; + size = refer_to_me.size(); + mode = _mode; + data =(T *) MemoryManager::ViewOpen(cpu_ptr, + size*sizeof(T), + mode, + AdviseDefault); + } + void ViewClose(void) + { // Inform the manager + MemoryManager::ViewClose(this->cpu_ptr,this->mode); + } +}; + +template vecView VectorView(std::vector &vec,ViewMode _mode) +{ + vecView ret(vec,_mode); // does the open + return ret; // must be closed +} + +// Little autoscope assister +template +class VectorViewCloser +{ + View v; // Take a copy of view and call view close when I go out of scope automatically + public: + VectorViewCloser(View &_v) : v(_v) {}; + ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);} +}; + +#define autoVecView(v_v,v,mode) \ + auto v_v = VectorView(v,mode); \ + ViewCloser _autoView##v_v(v_v); +*/ NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 0dc78f04..528ebbaa 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -209,9 +209,9 @@ private: static void CpuViewClose(uint64_t Ptr); static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); #endif - static void NotifyDeletion(void * CpuPtr); public: + static void NotifyDeletion(void * CpuPtr); static void Print(void); static void PrintAll(void); static void PrintState( void* CpuPtr); diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index e758ac2f..c610fb9c 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -8,7 +8,7 @@ NAMESPACE_BEGIN(Grid); static char print_buffer [ MAXLINE ]; #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; -#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer; //#define dprintf(...) 
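The MemoryManagerCache hunks below mostly retag log lines (dprintf now routes to GridLogDebug, and the Discard/Evict/Flush/Clone messages name pointers and copy directions explicitly), but read together they document the coherence protocol of the view cache: Clone copies host data into a freshly allocated device buffer, Flush copies device data back to the host, and Evict frees the device copy, leaving the CPU data primary. A schematic of those transitions, with a hypothetical enum standing in for the real AcceleratorViewEntry state field; this is a sketch, not the actual class:

#include <cassert>

enum class ViewState { Empty, CpuDirty, AccDirty, Consistent };

// Flush: acceleratorCopyFromDevice; device and host now agree.
ViewState Flush(ViewState s) { assert(s != ViewState::Empty); return ViewState::Consistent; }
// Clone: allocate on device if needed, then acceleratorCopyToDevice.
ViewState Clone(ViewState s) { assert(s != ViewState::Empty); return ViewState::Consistent; }
// Evict: free the device copy; the CPU data becomes primary.
ViewState Evict(ViewState s) { assert(s != ViewState::Empty); return ViewState::CpuDirty; }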
@@ -111,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -141,7 +141,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n", + mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); if (AccCache.accLock!=0) return; @@ -155,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)NULL; AccCache.state=CpuDirty; // CPU primary now DeviceBytes -=AccCache.bytes; - dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } // uint64_t CpuPtr = AccCache.CpuPtr; DeviceEvictions++; @@ -169,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -184,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -474,6 +474,7 @@ void MemoryManager::Print(void) std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl; std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl; std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; + acceleratorMem(); std::cout << GridLogMessage << "--------------------------------------------" << std::endl; } void MemoryManager::PrintAll(void) diff --git a/Grid/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h index ae1fd1fd..bb3c3b3f 100644 --- a/Grid/cartesian/Cartesian_base.h +++ b/Grid/cartesian/Cartesian_base.h @@ -70,8 +70,8 @@ public: Coordinate _istride; // Inner stride i.e. within simd lane int _osites; // _isites*_osites = product(dimensions). int _isites; - int _fsites; // _isites*_osites = product(dimensions). 
- int _gsites; + int64_t _fsites; // _isites*_osites = product(dimensions). + int64_t _gsites; Coordinate _slice_block;// subslice information Coordinate _slice_stride; Coordinate _slice_nblock; @@ -183,7 +183,7 @@ public: inline int Nsimd(void) const { return _isites; };// Synonymous with iSites inline int oSites(void) const { return _osites; }; inline int lSites(void) const { return _isites*_osites; }; - inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; + inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; }; inline int Nd (void) const { return _ndimension;}; inline const Coordinate LocalStarts(void) { return _lstart; }; @@ -214,7 +214,7 @@ public: //////////////////////////////////////////////////////////////// // Global addressing //////////////////////////////////////////////////////////////// - void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){ + void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){ assert(gidx< gSites()); Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); } @@ -222,7 +222,7 @@ public: assert(lidx &list); + void SendToRecvFromBegin(std::vector &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir); + void SendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 89b042e9..5fa70da4 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -306,6 +306,44 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N) int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); assert(ierr==0); } + +void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir) +{ + MPI_Request xrq; + MPI_Request rrq; + + assert(dest != _processor); + assert(from != _processor); + + int tag; + + tag= dir+from*32; + int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq); + assert(ierr==0); + list.push_back(rrq); + + tag= dir+_processor*32; + ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq); + assert(ierr==0); + list.push_back(xrq); +} +void CartesianCommunicator::CommsComplete(std::vector &list) +{ + int nreq=list.size(); + + if (nreq==0) return; + + std::vector status(nreq); + int ierr = MPI_Waitall(nreq,&list[0],&status[0]); + assert(ierr==0); + list.resize(0); +} + // Basic Halo comms primitive void CartesianCommunicator::SendToRecvFrom(void *xmit, int dest, diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index a06e054d..7e7dfac8 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -91,6 +91,17 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, { assert(0); } +void CartesianCommunicator::CommsComplete(std::vector &list){ assert(0);} +void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir) +{ + assert(0); +} + void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) { bcopy(in,out,bytes*words); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index b0b759b5..1aefd9c1 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -234,9 +234,12 @@ public: } template inline Lattice & operator = (const sobj & r){ - auto me = View(CpuWrite); - 
thread_for(ss,me.size(),{ - me[ss]= r; + vobj vtmp; + vtmp = r; + auto me = View(AcceleratorWrite); + accelerator_for(ss,me.size(),vobj::Nsimd(),{ + auto stmp=coalescedRead(vtmp); + coalescedWrite(me[ss],stmp); }); me.ViewClose(); return *this; @@ -360,7 +363,7 @@ public: template std::ostream& operator<< (std::ostream& stream, const Lattice &o){ typedef typename vobj::scalar_object sobj; - for(int g=0;g_gsites;g++){ + for(int64_t g=0;g_gsites;g++){ Coordinate gcoor; o.Grid()->GlobalIndexToGlobalCoor(g,gcoor); diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h index e31d8441..f9aca1f6 100644 --- a/Grid/lattice/Lattice_crc.h +++ b/Grid/lattice/Lattice_crc.h @@ -29,7 +29,7 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -template void DumpSliceNorm(std::string s,Lattice &f,int mu=-1) +template void DumpSliceNorm(std::string s,const Lattice &f,int mu=-1) { auto ff = localNorm2(f); if ( mu==-1 ) mu = f.Grid()->Nd()-1; diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 4e11378d..7b66c31d 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -204,6 +204,27 @@ template inline RealD norm2(const Lattice &arg){ return real(nrm); } + +template +inline auto norm2(const LatticeUnaryExpression & expr) ->RealD +{ + return norm2(closure(expr)); +} + +template +inline auto norm2(const LatticeBinaryExpression & expr) ->RealD +{ + return norm2(closure(expr)); +} + + +template +inline auto norm2(const LatticeTrinaryExpression & expr) ->RealD +{ + return norm2(closure(expr)); +} + + //The global maximum of the site norm2 template inline RealD maxLocalNorm2(const Lattice &arg) { diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 7c6c97de..292722c9 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -365,9 +365,14 @@ public: _bernoulli.resize(_vol,std::discrete_distribution{1,1}); _uid.resize(_vol,std::uniform_int_distribution() ); } - - template inline void fill(Lattice &l,std::vector &dist){ - + template inline void fill(Lattice &l,std::vector &dist) + { + if ( l.Grid()->_isCheckerBoarded ) { + Lattice tmp(_grid); + fill(tmp,dist); + pickCheckerboard(l.Checkerboard(),l,tmp); + return; + } typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -430,7 +435,7 @@ public: //////////////////////////////////////////////// thread_for( lidx, _grid->lSites(), { - int gidx; + int64_t gidx; int o_idx; int i_idx; int rank; diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index a936b1c0..c475829d 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -276,18 +276,33 @@ inline void blockProject(Lattice > &coarseData, autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( ip_ , ip, AcceleratorWrite); + RealD t_IP=0; + RealD t_co=0; + RealD t_za=0; for(int v=0;v + t_IP+=usecond(); + t_co-=usecond(); accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { convertType(coarseData_[sc](v),ip_[sc]); }); + t_co+=usecond(); // improve numerical stability of projection // |fine> = |fine> - |basis> ip=-ip; + t_za-=usecond(); blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); + t_za+=usecond(); } + // std::cout << GridLogPerformance << " blockProject : blockInnerProduct : "< inline void batchBlockProject(std::vector>> &coarseData, const std::vector> &fineData, @@ -393,8 +408,15 @@ template Lattice coarse_inner(coarse); // Precision promotion + 
RealD t; + t=-usecond(); fine_inner = localInnerProductD(fineX,fineY); + // t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "< convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); }); } + // t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "< &ip,Lattice &fineX) template inline void blockSum(Lattice &coarseData,const Lattice &fineData) { + const int maxsubsec=256; + typedef iVector vSubsec; + GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); @@ -463,35 +489,62 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( fineData_ , fineData, AcceleratorRead); - auto coarseData_p = &coarseData_[0]; - auto fineData_p = &fineData_[0]; + auto coarseData_p = &coarseData_[0]; + auto fineData_p = &fineData_[0]; Coordinate fine_rdimensions = fine->_rdimensions; Coordinate coarse_rdimensions = coarse->_rdimensions; - accelerator_for(sc,coarse->oSites(),1,{ + vobj zz = Zero(); + // Somewhat lazy calculation + // Find the biggest power of two subsection divisor less than or equal to maxsubsec + int subsec=maxsubsec; + int subvol; + subvol=blockVol/subsec; + while(subvol*subsec!=blockVol){ + subsec = subsec/2; + subvol=blockVol/subsec; + }; + + Lattice coarseTmp(coarse); + autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard); + auto coarseTmp_p= &coarseTmp_[0]; + + // Sum within subsecs in a first kernel + accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{ + + int sc=sce/subsec; + int e=sce%subsec; + // One thread per sub block Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate - vobj cd = Zero(); - - for(int sb=0;sboSites(),vobj::Nsimd(),{ + auto cd = coalescedRead(coarseTmp_p[sc](0)); + for(int e=1;e &ip,std::vector > blockOrthonormalize(ip,Basis); } -#if 0 +#ifdef GRID_ACCELERATED // TODO: CPU optimized version here template inline void blockPromote(const Lattice > &coarseData, @@ -574,26 +627,37 @@ inline void blockPromote(const Lattice > &coarseData, autoView( fineData_ , fineData, AcceleratorWrite); autoView( coarseData_ , coarseData, AcceleratorRead); + typedef LatticeView Vview; + std::vector AcceleratorVecViewContainer_h; + for(int v=0;v AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis); + acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview)); + auto Basis_p = &AcceleratorVecViewContainer[0]; // Loop with a cache friendly loop ordering - accelerator_for(sf,fine->oSites(),1,{ + Coordinate frdimensions=fine->_rdimensions; + Coordinate crdimensions=coarse->_rdimensions; + accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + Lexicographic::CoorFromIndex(coor_f,sf,frdimensions); for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + Lexicographic::IndexFromCoor(coor_c,sc,crdimensions); - for(int i=0;i inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -680,7 +744,11 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; - static const int words=sizeof(vobj)/sizeof(vector_type); + const int 
words=sizeof(vobj)/sizeof(vector_type); + + ////////////////////////////////////////////////////////////////////////////////////////// + // checks should guarantee that the operations are local + ////////////////////////////////////////////////////////////////////////////////////////// GridBase *Fg = From.Grid(); GridBase *Tg = To.Grid(); @@ -695,52 +763,38 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro for(int d=0;d_processors[d] == Tg->_processors[d]); } - // the above should guarantee that the operations are local - -#if 1 + + /////////////////////////////////////////////////////////// + // do the index calc on the GPU + /////////////////////////////////////////////////////////// + Coordinate f_ostride = Fg->_ostride; + Coordinate f_istride = Fg->_istride; + Coordinate f_rdimensions = Fg->_rdimensions; + Coordinate t_ostride = Tg->_ostride; + Coordinate t_istride = Tg->_istride; + Coordinate t_rdimensions = Tg->_rdimensions; size_t nsite = 1; for(int i=0;ioIndex(from_coor); - int fiidx = Fg->iIndex(from_coor); - int toidx = Tg->oIndex(to_coor); - int tiidx = Tg->iIndex(to_coor); - int* tt = table + 4*idx; - tt[0] = foidx; - tt[1] = fiidx; - tt[2] = toidx; - tt[3] = tiidx; - }); - - int* table_d = (int*)acceleratorAllocDevice(tbytes); - acceleratorCopyToDevice(table,table_d,tbytes); typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; autoView(from_v,From,AcceleratorRead); autoView(to_v,To,AcceleratorWrite); - + accelerator_for(idx,nsite,1,{ - static const int words=sizeof(vobj)/sizeof(vector_type); - int* tt = table_d + 4*idx; - int from_oidx = *tt++; - int from_lane = *tt++; - int to_oidx = *tt++; - int to_lane = *tt; + + Coordinate from_coor, to_coor, base; + Lexicographic::CoorFromIndex(base,idx,RegionSize); + for(int i=0;i &From,Lattice & To,Coordinate Fro stmp = getlane(from[w], from_lane); putlane(to[w], stmp, to_lane); } - }); - - acceleratorFreeDevice(table_d); - free(table); - - -#else - Coordinate ldf = Fg->_ldimensions; - Coordinate rdf = Fg->_rdimensions; - Coordinate isf = Fg->_istride; - Coordinate osf = Fg->_ostride; - Coordinate rdt = Tg->_rdimensions; - Coordinate ist = Tg->_istride; - Coordinate ost = Tg->_ostride; - - autoView( t_v , To, CpuWrite); - autoView( f_v , From, CpuRead); - thread_for(idx,Fg->lSites(),{ - sobj s; - Coordinate Fcoor(nd); - Coordinate Tcoor(nd); - Lexicographic::CoorFromIndex(Fcoor,idx,ldf); - int in_region=1; - for(int d=0;d=FromLowerLeft[d]+RegionSize[d]) ){ - in_region=0; - } - Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; - } - if (in_region) { -#if 0 - Integer idx_f = 0; for(int d=0;d +void InsertSliceFast(const Lattice &From,Lattice & To,int slice, int orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const int words=sizeof(vobj)/sizeof(vector_type); + + ////////////////////////////////////////////////////////////////////////////////////////// + // checks should guarantee that the operations are local + ////////////////////////////////////////////////////////////////////////////////////////// + GridBase *Fg = From.Grid(); + GridBase *Tg = To.Grid(); + assert(!Fg->_isCheckerBoarded); + assert(!Tg->_isCheckerBoarded); + int Nsimd = Fg->Nsimd(); + int nF = Fg->_ndimension; + int nT = Tg->_ndimension; + assert(nF+1 == nT); + + /////////////////////////////////////////////////////////// + // do the index calc on the GPU + 
/////////////////////////////////////////////////////////// + Coordinate f_ostride = Fg->_ostride; + Coordinate f_istride = Fg->_istride; + Coordinate f_rdimensions = Fg->_rdimensions; + Coordinate t_ostride = Tg->_ostride; + Coordinate t_istride = Tg->_istride; + Coordinate t_rdimensions = Tg->_rdimensions; + Coordinate RegionSize = Fg->_ldimensions; + size_t nsite = 1; + for(int i=0;i +void ExtractSliceFast(Lattice &To,const Lattice & From,int slice, int orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const int words=sizeof(vobj)/sizeof(vector_type); + + ////////////////////////////////////////////////////////////////////////////////////////// + // checks should guarantee that the operations are local + ////////////////////////////////////////////////////////////////////////////////////////// + GridBase *Fg = From.Grid(); + GridBase *Tg = To.Grid(); + assert(!Fg->_isCheckerBoarded); + assert(!Tg->_isCheckerBoarded); + int Nsimd = Fg->Nsimd(); + int nF = Fg->_ndimension; + int nT = Tg->_ndimension; + assert(nT+1 == nF); + + /////////////////////////////////////////////////////////// + // do the index calc on the GPU + /////////////////////////////////////////////////////////// + Coordinate f_ostride = Fg->_ostride; + Coordinate f_istride = Fg->_istride; + Coordinate f_rdimensions = Fg->_rdimensions; + Coordinate t_ostride = Tg->_ostride; + Coordinate t_istride = Tg->_istride; + Coordinate t_rdimensions = Tg->_rdimensions; + Coordinate RegionSize = Tg->_ldimensions; + size_t nsite = 1; + for(int i=0;i void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice, int orthog) @@ -889,9 +1033,7 @@ void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic } - -//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim -//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same +//Can I implement with local copyregion?? 
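+//Yes: the body below now forwards to localCopyRegion. The contract is unchanged:
+//insert the subvolume orthogonal to direction 'orthog' at slice index 'slice_lo' of
+//'lowDim' onto slice index 'slice_hi' of 'higherDim'; the local dimensions of the
+//two lattices orthogonal to 'orthog' must agree.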
template void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) { @@ -912,121 +1054,18 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int assert(lg->_ldimensions[d] == hg->_ldimensions[d]); } } - -#if 1 - size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog]; - size_t tbytes = 4*nsite*sizeof(int); - int *table = (int*)malloc(tbytes); - - thread_for(idx,nsite,{ - Coordinate lcoor(nl); - Coordinate hcoor(nh); - lcoor[orthog] = slice_lo; - hcoor[orthog] = slice_hi; - size_t rem = idx; - for(int mu=0;muLocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu]; - lcoor[mu] = hcoor[mu] = xmu; - } - } - int loidx = lg->oIndex(lcoor); - int liidx = lg->iIndex(lcoor); - int hoidx = hg->oIndex(hcoor); - int hiidx = hg->iIndex(hcoor); - int* tt = table + 4*idx; - tt[0] = loidx; - tt[1] = liidx; - tt[2] = hoidx; - tt[3] = hiidx; - }); - - int* table_d = (int*)acceleratorAllocDevice(tbytes); - acceleratorCopyToDevice(table,table_d,tbytes); - - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_type scalar_type; - - autoView(lowDim_v,lowDim,AcceleratorRead); - autoView(higherDim_v,higherDim,AcceleratorWrite); - - accelerator_for(idx,nsite,1,{ - static const int words=sizeof(vobj)/sizeof(vector_type); - int* tt = table_d + 4*idx; - int from_oidx = *tt++; - int from_lane = *tt++; - int to_oidx = *tt++; - int to_lane = *tt; - - const vector_type* from = (const vector_type *)&lowDim_v[from_oidx]; - vector_type* to = (vector_type *)&higherDim_v[to_oidx]; - - scalar_type stmp; - for(int w=0;wlSites(),{ - sobj s; - Coordinate lcoor(nl); - Coordinate hcoor(nh); - lg->LocalIndexToLocalCoor(idx,lcoor); - if( lcoor[orthog] == slice_lo ) { - hcoor=lcoor; - hcoor[orthog] = slice_hi; - peekLocalSite(s,lowDimv,lcoor); - pokeLocalSite(s,higherDimv,hcoor); - } - }); -#endif + Coordinate sz = lg->_ldimensions; + sz[orthog]=1; + Coordinate f_ll(nl,0); f_ll[orthog]=slice_lo; + Coordinate t_ll(nh,0); t_ll[orthog]=slice_hi; + localCopyRegion(lowDim,higherDim,f_ll,t_ll,sz); } template void ExtractSliceLocal(Lattice &lowDim,const Lattice & higherDim,int slice_lo,int slice_hi, int orthog) { - typedef typename vobj::scalar_object sobj; - - GridBase *lg = lowDim.Grid(); - GridBase *hg = higherDim.Grid(); - int nl = lg->_ndimension; - int nh = hg->_ndimension; - - assert(nl == nh); - assert(orthog=0); - - for(int d=0;d_processors[d] == hg->_processors[d]); - assert(lg->_ldimensions[d] == hg->_ldimensions[d]); - } - } - - // the above should guarantee that the operations are local - autoView(lowDimv,lowDim,CpuWrite); - autoView(higherDimv,higherDim,CpuRead); - thread_for(idx,lg->lSites(),{ - sobj s; - Coordinate lcoor(nl); - Coordinate hcoor(nh); - lg->LocalIndexToLocalCoor(idx,lcoor); - if( lcoor[orthog] == slice_lo ) { - hcoor=lcoor; - hcoor[orthog] = slice_hi; - peekLocalSite(s,higherDimv,hcoor); - pokeLocalSite(s,lowDimv,lcoor); - } - }); + InsertSliceLocal(higherDim,lowDim,slice_hi,slice_lo,orthog); } @@ -1052,7 +1091,7 @@ void Replicate(const Lattice &coarse,Lattice & fine) Coordinate fcoor(nd); Coordinate ccoor(nd); - for(int g=0;ggSites();g++){ + for(int64_t g=0;ggSites();g++){ fg->GlobalIndexToGlobalCoor(g,fcoor); for(int d=0;d > & full,Lattice & split) } } +////////////////////////////////////////////////////// +// Faster but less accurate blockProject +////////////////////////////////////////////////////// +template +inline void blockProjectFast(Lattice > &coarseData, + const Lattice &fineData, + const VLattice &Basis) +{ 
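+  // Sketch of the loop structure below: for each basis vector v, form the block
+  // inner product of Basis[v] with fineData (in double precision, presumably via
+  // blockInnerProductD), then copy the result into component v of coarseData on
+  // the device. The t_IP/t_co accumulators time the two phases but are not
+  // currently reported.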
+ GridBase * fine = fineData.Grid(); + GridBase * coarse= coarseData.Grid(); + + Lattice > ip(coarse); + + autoView( coarseData_ , coarseData, AcceleratorWrite); + autoView( ip_ , ip, AcceleratorWrite); + RealD t_IP=0; + RealD t_co=0; + for(int v=0;voSites(), vobj::Nsimd(), { + convertType(coarseData_[sc](v),ip_[sc]); + }); + t_co+=usecond(); + } +} + + NAMESPACE_END(Grid); diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index 5dfb6b0f..ad1496f5 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -45,6 +45,188 @@ struct CshiftImplGauge: public CshiftImplBase inline void ScatterSlice(const cshiftVector &buf, + Lattice &lat, + int x, + int dim, + int offset=0) +{ + const int Nsimd=vobj::Nsimd(); + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *grid = lat.Grid(); + Coordinate simd = grid->_simd_layout; + int Nd = grid->Nd(); + int block = grid->_slice_block[dim]; + int stride = grid->_slice_stride[dim]; + int nblock = grid->_slice_nblock[dim]; + int rd = grid->_rdimensions[dim]; + + int ox = x%rd; + int ix = x/rd; + + int isites = 1; for(int d=0;d inline void GatherSlice(cshiftVector &buf, + const Lattice &lat, + int x, + int dim, + int offset=0) +{ + const int Nsimd=vobj::Nsimd(); + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + autoView(lat_v, lat, AcceleratorRead); + + GridBase *grid = lat.Grid(); + Coordinate simd = grid->_simd_layout; + int Nd = grid->Nd(); + int block = grid->_slice_block[dim]; + int stride = grid->_slice_stride[dim]; + int nblock = grid->_slice_nblock[dim]; + int rd = grid->_rdimensions[dim]; + + int ox = x%rd; + int ix = x/rd; + + int isites = 1; for(int d=0;dNd(); AllocateGrids(); Coordinate local =unpadded_grid->LocalDimensions(); + Coordinate procs =unpadded_grid->ProcessorGrid(); for(int d=0;d=depth); + if ( procs[d] > 1 ) assert(local[d]>=depth); } } void DeleteGrids(void) { + Coordinate processors=unpadded_grid->_processors; for(int d=0;d 1 ) { + delete grids[d]; + } } grids.resize(0); }; @@ -81,27 +267,36 @@ public: Coordinate processors=unpadded_grid->_processors; Coordinate plocal =unpadded_grid->LocalDimensions(); Coordinate global(dims); - + GridCartesian *old_grid = unpadded_grid; // expand up one dim at a time for(int d=0;d 1 ) { + plocal[d] += 2*depth; + + for(int d=0;d inline Lattice Extract(const Lattice &in) const { + Coordinate processors=unpadded_grid->_processors; + Lattice out(unpadded_grid); Coordinate local =unpadded_grid->LocalDimensions(); - Coordinate fll(dims,depth); // depends on the MPI spread + // depends on the MPI spread + Coordinate fll(dims,depth); Coordinate tll(dims,0); // depends on the MPI spread + for(int d=0;d + inline Lattice ExchangePeriodic(const Lattice &in) const + { + GridBase *old_grid = in.Grid(); + int dims = old_grid->Nd(); + Lattice tmp = in; + for(int d=0;d inline Lattice Expand(int dim, const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { + Coordinate processors=unpadded_grid->_processors; GridBase *old_grid = in.Grid(); GridCartesian *new_grid = grids[dim];//These are new grids Lattice padded(new_grid); @@ -129,46 +336,236 @@ public: if(dim==0) conformable(old_grid,unpadded_grid); else conformable(old_grid,grids[dim-1]); - std::cout << " dim "< + inline Lattice ExpandPeriodic(int dim, const Lattice &in) const + { + Coordinate 
processors=unpadded_grid->_processors; + GridBase *old_grid = in.Grid(); + GridCartesian *new_grid = grids[dim];//These are new grids + Lattice padded(new_grid); + // Lattice shifted(old_grid); + Coordinate local =old_grid->LocalDimensions(); + Coordinate plocal =new_grid->LocalDimensions(); + if(dim==0) conformable(old_grid,unpadded_grid); + else conformable(old_grid,grids[dim-1]); + + // std::cout << " dim "< + void Face_exchange(const Lattice &from, + Lattice &to, + int dimension,int depth) const + { + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::scalar_object sobj; + + RealD t_gather=0.0; + RealD t_scatter=0.0; + RealD t_comms=0.0; + RealD t_copy=0.0; + + // std::cout << GridLogMessage << "dimension " <_ldimensions; + Coordinate nlds= to.Grid()->_ldimensions; + Coordinate simd= from.Grid()->_simd_layout; + int ld = lds[dimension]; + int nld = to.Grid()->_ldimensions[dimension]; + const int Nsimd = vobj::Nsimd(); + + assert(depth<=lds[dimension]); // A must be on neighbouring node + assert(depth>0); // A caller bug if zero + assert(ld+2*depth==nld); + //////////////////////////////////////////////////////////////////////////// + // Face size and byte calculations + //////////////////////////////////////////////////////////////////////////// + int buffer_size = 1; + for(int d=0;d_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); + + static cshiftVector send_buf; + static cshiftVector recv_buf; + send_buf.resize(buffer_size*2*depth); + recv_buf.resize(buffer_size*2*depth); + + std::vector fwd_req; + std::vector bwd_req; + + int words = buffer_size; + int bytes = words * sizeof(vobj); + + //////////////////////////////////////////////////////////////////////////// + // Communication coords + //////////////////////////////////////////////////////////////////////////// + int comm_proc = 1; + int xmit_to_rank; + int recv_from_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + //////////////////////////////////////////////////////////////////////////// + // Gather all surface terms up to depth "d" + //////////////////////////////////////////////////////////////////////////// + RealD t; + RealD t_tot=-usecond(); + int plane=0; + for ( int d=0;d < depth ; d ++ ) { + int tag = d*1024 + dimension*2+0; + + t=usecond(); + GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++; + t_gather+=usecond()-t; + + t=usecond(); + grid->SendToRecvFromBegin(fwd_req, + (void *)&send_buf[d*buffer_size], xmit_to_rank, + (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); + t_comms+=usecond()-t; + } + for ( int d=0;d < depth ; d ++ ) { + int tag = d*1024 + dimension*2+1; + + t=usecond(); + GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++; + t_gather+= usecond() - t; + + t=usecond(); + grid->SendToRecvFromBegin(bwd_req, + (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, + (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); + t_comms+=usecond()-t; + } + + //////////////////////////////////////////////////////////////////////////// + // Copy interior -- overlap this with comms + //////////////////////////////////////////////////////////////////////////// + int Nd = new_grid->Nd(); + Coordinate LL(Nd,0); + Coordinate sz = grid->_ldimensions; + Coordinate toLL(Nd,0); + toLL[dimension]=depth; + t=usecond(); + localCopyRegion(from,to,LL,toLL,sz); + t_copy= usecond() - t; + + 
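+    ////////////////////////////////////////////////////////////////////////////
+    // The two CommsComplete calls below each gate only the scatters that consume
+    // that batch of receive buffers: fwd_req data fills the far halo slices
+    // (nld-depth+d), bwd_req data the near ones (d), so the first set of scatters
+    // can proceed while the second set of messages is still in flight.
+    ////////////////////////////////////////////////////////////////////////////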
//////////////////////////////////////////////////////////////////////////// + // Scatter all faces + //////////////////////////////////////////////////////////////////////////// + plane=0; + + t=usecond(); + grid->CommsComplete(fwd_req); + t_comms+= usecond() - t; + + t=usecond(); + for ( int d=0;d < depth ; d ++ ) { + ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++; + } + t_scatter= usecond() - t; + + t=usecond(); + grid->CommsComplete(bwd_req); + t_comms+= usecond() - t; + + t=usecond(); + for ( int d=0;d < depth ; d ++ ) { + ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++; + } + t_scatter+= usecond() - t; + t_tot+=usecond(); + + std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"< scalardata(lsites); std::vector iodata(lsites); // Munge, checksum, byte order in here - IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC, + IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control, nersc_csum,scidac_csuma,scidac_csumb); GridStopWatch timer; @@ -582,7 +584,8 @@ class BinaryIO { const std::string &format, uint32_t &nersc_csum, uint32_t &scidac_csuma, - uint32_t &scidac_csumb) + uint32_t &scidac_csumb, + int control=BINARYIO_LEXICOGRAPHIC) { typedef typename vobj::scalar_object sobj; typedef typename vobj::Realified::scalar_type word; word w=0; @@ -607,7 +610,7 @@ class BinaryIO { while (attemptsLeft >= 0) { grid->Barrier(); - IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, + IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control, nersc_csum,scidac_csuma,scidac_csumb); if (checkWrite) { @@ -617,7 +620,7 @@ class BinaryIO { std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl; grid->Barrier(); - IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC, + IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control, cknersc_csum,ckscidac_csuma,ckscidac_csumb); if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb)) { diff --git a/Grid/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h index 80d135d2..688d1ac9 100644 --- a/Grid/parallelIO/IldgIO.h +++ b/Grid/parallelIO/IldgIO.h @@ -162,8 +162,14 @@ template void ScidacMetaData(Lattice & field, { uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16); uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16); - if ( scidac_csuma !=scidac_checksuma) return 0; - if ( scidac_csumb !=scidac_checksumb) return 0; + std::cout << GridLogMessage << " scidacChecksumVerify computed "< - void readLimeLatticeBinaryObject(Lattice &field,std::string record_name) + void readLimeLatticeBinaryObject(Lattice &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC) { typedef typename vobj::scalar_object sobj; scidacChecksum scidacChecksum_; @@ -238,7 +244,7 @@ class GridLimeReader : public BinaryIO { uint64_t offset= ftello(File); // std::cout << " ReadLatticeObject from offset "< munge; - BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); + BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control); std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl; std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl; 
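+    // NB: the new 'control' argument defaults to BINARYIO_LEXICOGRAPHIC, so all
+    // existing call sites keep their behaviour; any other BINARYIO_* ordering
+    // flag is forwarded verbatim to BinaryIO::readLatticeObject above.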
///////////////////////////////////////////// @@ -408,7 +414,7 @@ class GridLimeWriter : public BinaryIO // in communicator used by the field.Grid() //////////////////////////////////////////////////// template - void writeLimeLatticeBinaryObject(Lattice &field,std::string record_name) + void writeLimeLatticeBinaryObject(Lattice &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC) { //////////////////////////////////////////////////////////////////// // NB: FILE and iostream are jointly writing disjoint sequences in the @@ -459,7 +465,7 @@ class GridLimeWriter : public BinaryIO /////////////////////////////////////////// std::string format = getFormatString(); BinarySimpleMunger munge; - BinaryIO::writeLatticeObject(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb); + BinaryIO::writeLatticeObject(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control); /////////////////////////////////////////// // Wind forward and close the record @@ -512,7 +518,8 @@ class ScidacWriter : public GridLimeWriter { //////////////////////////////////////////////// template void writeScidacFieldRecord(Lattice &field,userRecord _userRecord, - const unsigned int recordScientificPrec = 0) + const unsigned int recordScientificPrec = 0, + int control=BINARYIO_LEXICOGRAPHIC) { GridBase * grid = field.Grid(); @@ -534,7 +541,7 @@ class ScidacWriter : public GridLimeWriter { writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); } // Collective call - writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum + writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control); // Closes message with checksum } }; @@ -553,7 +560,8 @@ class ScidacReader : public GridLimeReader { // Write generic lattice field in scidac format //////////////////////////////////////////////// template - void readScidacFieldRecord(Lattice &field,userRecord &_userRecord) + void readScidacFieldRecord(Lattice &field,userRecord &_userRecord, + int control=BINARYIO_LEXICOGRAPHIC) { typedef typename vobj::scalar_object sobj; GridBase * grid = field.Grid(); @@ -571,7 +579,7 @@ class ScidacReader : public GridLimeReader { readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML)); readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); - readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); + readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control); } void skipPastBinaryRecord(void) { std::string rec_name(ILDG_BINARY_DATA); diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 529ea090..dfd0a416 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -170,7 +170,7 @@ public: typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix; int Nsites = U_v.size(); - auto gStencil_v = gStencil.View(); + auto gStencil_v = gStencil.View(AcceleratorRead); accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs stencilElement SE0, SE1, SE2, SE3, SE4, SE5; @@ -386,4 +386,4 @@ public: }; -NAMESPACE_END(Grid); \ No newline at end of file +NAMESPACE_END(Grid); diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index d6c0d621..851ba172 
100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -488,7 +488,7 @@ public: for(int mu=0;muoSites(), (size_t)ggrid->Nsimd(), { decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; @@ -1200,7 +1200,7 @@ public: { //view scope autoView( gStaple_v , gStaple, AcceleratorWrite); - auto gStencil_v = gStencil.View(); + auto gStencil_v = gStencil.View(AcceleratorRead); accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), { decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 0a3d176f..976d5568 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -1130,6 +1130,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc #endif #endif +// Fixme need coalesced read gpermute +template void gpermute(vobj & inout,int perm){ + vobj tmp=inout; + if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;} + if (perm & 0x2 ) { permute(inout,tmp,1); tmp=inout;} + if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;} + if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;} +} NAMESPACE_END(Grid); diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h index 9af9b834..b6848977 100644 --- a/Grid/stencil/GeneralLocalStencil.h +++ b/Grid/stencil/GeneralLocalStencil.h @@ -32,7 +32,12 @@ NAMESPACE_BEGIN(Grid); struct GeneralStencilEntry { uint64_t _offset; // 4 bytes uint8_t _permute; // 1 bytes // Horrible alignment properties + uint8_t _wrap; // 1 bytes // Horrible alignment properties }; +struct GeneralStencilEntryReordered : public GeneralStencilEntry { + uint64_t _input; +}; + // Could pack to 8 + 4 + 4 = 128 bit and use class GeneralLocalStencilView { @@ -46,7 +51,7 @@ class GeneralLocalStencilView { accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) const { return & this->_entries_p[point+this->_npoints*osite]; } - + void ViewClose(void){}; }; //////////////////////////////////////// // The Stencil Class itself @@ -61,7 +66,7 @@ protected: public: GridBase *Grid(void) const { return _grid; } - View_type View(void) const { + View_type View(int mode) const { View_type accessor(*( (View_type *) this)); return accessor; } @@ -101,17 +106,23 @@ public: // Simpler version using icoor calculation //////////////////////////////////////////////// SE._permute =0; + SE._wrap=0; for(int d=0;d_fdimensions[d]; int rd = grid->_rdimensions[d]; + int ld = grid->_ldimensions[d]; int ly = grid->_simd_layout[d]; - assert((ly==1)||(ly==2)); + assert((ly==1)||(ly==2)||(ly==grid->Nsimd())); int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1 int x = Coor[d]; // x in [0... 
rd-1] as an oSite + if ( (x + shift)%fd != (x+shift)%ld ){ + SE._wrap = 1; + } + int permute_dim = grid->PermuteDim(d); int permute_slice=0; if(permute_dim){ diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 19411b62..0d9a8874 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -122,7 +122,7 @@ hipStream_t computeStream; void acceleratorInit(void) { int nDevices = 1; - hipGetDeviceCount(&nDevices); + auto discard = hipGetDeviceCount(&nDevices); gpu_props = new hipDeviceProp_t[nDevices]; char * localRankStr = NULL; @@ -149,7 +149,7 @@ void acceleratorInit(void) #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); - auto r=hipGetDeviceProperties(&gpu_props[i], i); + discard = hipGetDeviceProperties(&gpu_props[i], i); hipDeviceProp_t prop; prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; @@ -186,13 +186,13 @@ void acceleratorInit(void) } int device = rank; #endif - hipSetDevice(device); - hipStreamCreate(©Stream); - hipStreamCreate(&computeStream); + discard = hipSetDevice(device); + discard = hipStreamCreate(©Stream); + discard = hipStreamCreate(&computeStream); const int len=64; char busid[len]; if( rank == world_rank ) { - hipDeviceGetPCIBusId(busid, len, device); + discard = hipDeviceGetPCIBusId(busid, len, device); printf("local rank %d device %d bus id: %s\n", rank, device, busid); } if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 392cba61..48062b56 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -117,7 +117,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #endif } // CUDA specific -inline void cuda_mem(void) +inline void acceleratorMem(void) { size_t free_t,total_t,used_t; cudaMemGetInfo(&free_t,&total_t); @@ -125,6 +125,11 @@ inline void cuda_mem(void) std::cout << " MemoryManager : GPU used "<>>(num1,num2,nsimd,lambda); \ } +#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ + { \ + int nt=acceleratorThreads(); \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ + __VA_ARGS__; \ + }; \ + dim3 cu_threads(nsimd,acceleratorThreads(),1); \ + dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ + ProfileLambdaApply<<>>(num1,num2,nsimd,lambda); \ + } #define accelerator_for6dNB(iter1, num1, \ iter2, num2, \ @@ -157,6 +174,20 @@ inline void cuda_mem(void) Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ } + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... 
) \ + { \ + int nt=acceleratorThreads(); \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ + __VA_ARGS__; \ + }; \ + dim3 cu_threads(nsimd,acceleratorThreads(),1); \ + dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ + LambdaApply<<>>(num1,num2,nsimd,lambda); \ + } + template __global__ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) { @@ -168,6 +199,17 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) Lambda(x,y,z); } } +template __global__ +void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) +{ + // Weird permute is to make lane coalesce for large blocks + uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; + uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; + uint64_t z = threadIdx.x; + if ( (x < num1) && (y __global__ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, @@ -208,6 +250,7 @@ inline void *acceleratorAllocShared(size_t bytes) if( err != cudaSuccess ) { ptr = (void *) NULL; printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err)); + assert(0); } return ptr; }; @@ -234,6 +277,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes } inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); }; + inline int acceleratorIsCommunicable(void *ptr) { // int uvm=0; @@ -265,6 +309,11 @@ NAMESPACE_END(Grid); NAMESPACE_BEGIN(Grid); +inline void acceleratorMem(void) +{ + std::cout <<" SYCL acceleratorMem not implemented"< void acceleratorPut(T& dev,T&host) { - acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); - acceleratorCopySynchronise(); + acceleratorCopyToDevice(&host,&dev,sizeof(T)); +} +template T acceleratorGet(T& dev) +{ + T host; + acceleratorCopyFromDevice(&dev,&host,sizeof(T)); + return host; } + + NAMESPACE_END(Grid); diff --git a/Grid/util/Coordinate.h b/Grid/util/Coordinate.h index 05b83bf9..b79967e5 100644 --- a/Grid/util/Coordinate.h +++ b/Grid/util/Coordinate.h @@ -94,6 +94,13 @@ static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION; typedef AcceleratorVector Coordinate; +template +inline bool operator==(const AcceleratorVector &v,const AcceleratorVector &w) +{ + if (v.size()!=w.size()) return false; + for(int i=0;i inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector &v) { diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index 4b8e0346..8e805575 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -285,12 +285,6 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) XmitLoggingCounter++; } #endif - } else { - uint64_t word = 1; - deviceVector dev(1); - acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t)); - acceleratorCopySynchronise(); - MPI_Barrier(MPI_COMM_WORLD); } } void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 62ee670c..3a81735d 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -292,6 +292,7 @@ void GridBanner(void) std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl; #endif std::cout << std::endl; + std::cout << std::setprecision(9); } void Grid_init(int *argc,char ***argv) @@ -424,7 +425,7 @@ void Grid_init(int *argc,char ***argv) // Logging //////////////////////////////////// std::vector logstreams; - std::string defaultLog("Error,Warning,Message,Performance"); + std::string defaultLog("Error,Warning,Message"); 
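+  // Performance is deliberately dropped from the default log set; it can still
+  // be enabled at run time (e.g. via the --log command line option) when wanted.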
GridCmdOptionCSL(defaultLog,logstreams); GridLogConfigure(logstreams); @@ -548,6 +549,10 @@ void Grid_init(int *argc,char ***argv) void Grid_finalize(void) { + std::cout< - static accelerator_inline void CoorFromIndex (coor_t& coor,int index,const coor_t &dims){ + static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){ int nd= dims.size(); coor.resize(nd); for(int d=0;d - static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){ + static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){ int nd=dims.size(); int stride=1; index=0; for(int d=0;d + static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){ + int64_t index64; + IndexFromCoor(coor,index64,dims); + assert(index64<2*1024*1024*1024LL); + index = (int) index64; + } template - static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){ + static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){ int nd=dims.size(); int stride=1; index=0; for(int d=nd-1;d>=0;d--){ - index = index+stride*coor[d]; + index = index+(int64_t)stride*coor[d]; stride=stride*dims[d]; } } template - static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){ + static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){ + int64_t index64; + IndexFromCoorReversed(coor,index64,dims); + if ( index64>=2*1024*1024*1024LL ){ + std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "< + static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){ int nd= dims.size(); coor.resize(nd); for(int d=nd-1;d>=0;d--){ diff --git a/HMC/site_autocorrelation.cc b/HMC/site_autocorrelation.cc new file mode 100644 index 00000000..d1d1b98e --- /dev/null +++ b/HMC/site_autocorrelation.cc @@ -0,0 +1,90 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: + +Copyright (C) 2017 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include + +template void readFile(T& out, std::string const fname){ + Grid::emptyUserRecord record; + Grid::ScidacReader RD; + RD.open(fname); + RD.readScidacFieldRecord(out,record); + RD.close(); +} + + +int main(int argc, char **argv) { + using namespace Grid; + + Grid_init(&argc, &argv); + GridLogLayout(); + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + + LatticeComplexD plaq1(&Grid), plaq2(&Grid); + + FieldMetaData header; + + double vol = plaq1.Grid()->gSites(); + + std::string file1(argv[1]); + std::cout << "Reading "< +#include + +NAMESPACE_BEGIN(Grid); +template void writeFile(T& out, std::string const fname){ + emptyUserRecord record; + ScidacWriter WR(out.Grid()->IsBoss()); + WR.open(fname); + WR.writeScidacFieldRecord(out,record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC); + WR.close(); +} +NAMESPACE_END(Grid); +int main(int argc, char **argv) { + using namespace Grid; + + Grid_init(&argc, &argv); + GridLogLayout(); + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + + LatticeGaugeField Umu(&Grid); + std::vector U(4,&Grid); + LatticeComplexD plaq(&Grid); + + FieldMetaData header; + + double vol = Umu.Grid()->gSites(); + double faces = (1.0 * Nd * (Nd - 1)) / 2.0; + double Ncdiv = 1.0/Nc; + + std::string file1(argv[1]); + std::string file2(argv[2]); + std::cout << "Reading "<(Umu,mu); + } + SU3WilsonLoops::sitePlaquette(plaq,U); + + plaq = plaq *(Ncdiv/faces); + + std::cout << "Writing "< > &linop, Aggregation & Subspace) -- DONE + ExchangeCoarseLinks(); + +iii) Aurora -- christoph's problem -- DONE + Aurora -- Carleton's problem staggered. + +iv) Dennis merge and test Aurora -- DONE (save test) + +v) Merge Ed Bennet's request --DONE + +vi) Repro CG -- get down to the level of single node testing via split grid test + + +========================= + +=============== +- - Slice sum optimisation & A2A - atomic addition -- Dennis - - Also faster non-atomic reduction -- - Remaining PRs - - DDHMC - - MixedPrec is the action eval, high precision - - MixedPrecCleanup is the force eval, low precision @@ -17,7 +61,6 @@ DDHMC -- Multishift Mixed Precision - DONE -- Pole dependent residual - DONE - ======= -- comms threads issue?? 
-- Part done: Staggered kernel performance on GPU diff --git a/scripts/prequisites.sh b/scripts/prequisites.sh new file mode 100755 index 00000000..c5bd9ed1 --- /dev/null +++ b/scripts/prequisites.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +if [ $1 = "install" ] +then + dir=`pwd` + cd $HOME + git clone -c feature.manyFiles=true https://github.com/spack/spack.git + source $HOME/spack/share/spack/setup-env.sh + + spack install autoconf + spack install automake + spack install c-lime cppflags=-fPIE + spack install fftw + spack install llvm + spack install gmp + spack install mpfr + spack install cuda@11.8 + spack install openmpi + spack install openssl + spack install hdf5 +else + source $HOME/spack/share/spack/setup-env.sh +fi + +spack load autoconf +spack load automake +spack load c-lime +spack load fftw +spack load llvm +spack load gmp +spack load mpfr +spack load cuda@11.8 +spack load openmpi +spack load openssl +spack load hdf5 + +export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' ` +export HDF5=`spack find --paths hdf5 | grep ^hdf5 | awk '{print $2}' ` +export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` +export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' ` +export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' ` +export NVIDIA=$CUDA_HOME +export NVIDIALIB=$NVIDIA/targets/x86_64-linux/lib/ +export LD_LIBRARY_PATH=$NVIDIALIB:$FFTW/lib/:$MPFR/lib:$LD_LIBRARY_PATH diff --git a/systems/Frontier/benchmarks/bench2.slurm b/systems/Frontier/benchmarks/bench2.slurm new file mode 100755 index 00000000..cc82de79 --- /dev/null +++ b/systems/Frontier/benchmarks/bench2.slurm @@ -0,0 +1,43 @@ +#!/bin/bash -l +#SBATCH --job-name=bench +##SBATCH --partition=small-g +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=7 +#SBATCH --gpus-per-node=8 +#SBATCH --time=00:10:00 +#SBATCH --account=phy157_dwf +#SBATCH --gpu-bind=none +#SBATCH --exclusive +#SBATCH --mem=0 + +cat << EOF > select_gpu +#!/bin/bash +export GPU_MAP=(0 1 2 3 7 6 5 4) +export NUMA_MAP=(3 3 1 1 2 2 0 0) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +export HIP_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec numactl -m \$NUMA -N \$NUMA \$* +EOF + +chmod +x ./select_gpu + +root=$HOME/Frontier/Grid/systems/Frontier/ +source ${root}/sourceme.sh + +export OMP_NUM_THREADS=7 +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_SMP_SINGLE_COPY_MODE=XPMEM + +for vol in 32.32.32.64 +do +srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol +srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol + +srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol +srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol +done + diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command index 4f52b3be..a1c113e4 100644 --- a/systems/Frontier/config-command +++ b/systems/Frontier/config-command @@ -15,8 +15,8 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ --disable-fermion-reps \ CXX=hipcc MPICXX=mpicxx \ -CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ 
-I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \ - LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas" +CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \ + LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas" diff --git a/systems/Frontier/mpiwrapper.sh b/systems/Frontier/mpiwrapper.sh new file mode 100755 index 00000000..f6a56698 --- /dev/null +++ b/systems/Frontier/mpiwrapper.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +lrank=$SLURM_LOCALID +lgpu=(0 1 2 3 7 6 5 4) + +export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]} + +echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES " + +$* + + + diff --git a/systems/Frontier/sourceme.sh b/systems/Frontier/sourceme.sh index 987241b4..652e5b45 100644 --- a/systems/Frontier/sourceme.sh +++ b/systems/Frontier/sourceme.sh @@ -1,6 +1,5 @@ . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh spack load c-lime -#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib module load emacs module load PrgEnv-gnu module load rocm diff --git a/systems/Frontier/wrap.sh b/systems/Frontier/wrap.sh new file mode 100755 index 00000000..eb58353c --- /dev/null +++ b/systems/Frontier/wrap.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +export HIP_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES +unset ROCR_VISIBLE_DEVICES + +#rank=$SLURM_PROCID +#rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace $@ + +$@ diff --git a/systems/mac-arm/config-command-mpi b/systems/mac-arm/config-command-mpi index 174545d0..be4d23dc 100644 --- a/systems/mac-arm/config-command-mpi +++ b/systems/mac-arm/config-command-mpi @@ -1,3 +1,2 @@ CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=c++-13 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug - diff --git a/tests/debug/Test_general_coarse.cc b/tests/debug/Test_general_coarse.cc new file mode 100644 index 00000000..28130a3c --- /dev/null +++ b/tests/debug/Test_general_coarse.cc @@ -0,0 +1,319 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/debug/Test_general_coarse.cc + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +#include +#include +#include + +using namespace std; +using namespace Grid; + +gridblasHandle_t GridBLAS::gridblasHandle; +int GridBLAS::gridblasInit; + +/////////////////////// +// Tells little dirac op to use MdagM as the .Op() +/////////////////////// +template +class HermOpAdaptor : public LinearOperatorBase +{ + LinearOperatorBase & wrapped; +public: + HermOpAdaptor(LinearOperatorBase &wrapme) : wrapped(wrapme) {}; + void OpDiag (const Field &in, Field &out) { assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out){ assert(0); }; + void Op (const Field &in, Field &out){ + wrapped.HermOp(in,out); + } + void AdjOp (const Field &in, Field &out){ + wrapped.HermOp(in,out); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } + void HermOp(const Field &in, Field &out){ + wrapped.HermOp(in,out); + } +}; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=4; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + // Construct a coarsened grid + Coordinate clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + + LatticeFermion src(FGrid); random(RNG5,src); + LatticeFermion result(FGrid); result=Zero(); + LatticeFermion ref(FGrid); ref=Zero(); + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + LatticeGaugeField Umu(UGrid); + SU::HotConfiguration(RNG4,Umu); + // Umu=Zero(); + + RealD mass=0.1; + RealD M5=1.8; + + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + const int nbasis = 62; + const int cb = 0 ; + LatticeFermion prom(FGrid); + + std::vector subspace(nbasis,FGrid); + + std::cout< HermDefOp(Ddwf); + + /////////////////////////////////////////////////// + // Random aggregation space + /////////////////////////////////////////////////// + std::cout< Subspace; + Subspace Aggregates(Coarse5d,FGrid,cb); + Aggregates.CreateSubspaceRandom(RNG5); + + /////////////////////////////////////////////////// + // Build little dirac op + /////////////////////////////////////////////////// + std::cout< LittleDiracOperator; + typedef LittleDiracOperator::CoarseVector CoarseVector; + + NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); + LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse5d); + LittleDiracOperator LittleDiracOpCol(geom,FGrid,Coarse5d); + + HermOpAdaptor HOA(HermDefOp); + + LittleDiracOp.CoarsenOperator(HOA,Aggregates); + + /////////////////////////////////////////////////// + // Test the operator + /////////////////////////////////////////////////// + CoarseVector c_src (Coarse5d); + CoarseVector c_res (Coarse5d); + CoarseVector c_res_dag(Coarse5d); + CoarseVector c_proj(Coarse5d); + + 
subspace=Aggregates.subspace; + + // random(CRNG,c_src); + c_src = 1.0; + + blockPromote(c_src,err,subspace); + + prom=Zero(); + for(int b=0;b HermMatrix; + HermMatrix MrhsCoarseOp (mrhs); + + GridParallelRNG rh_CRNG(CoarseMrhs);rh_CRNG.SeedFixedIntegers(cseeds); + ConjugateGradient mrhsCG(1.0e-8,2000,true); + CoarseVector rh_res(CoarseMrhs); + CoarseVector rh_src(CoarseMrhs); + random(rh_CRNG,rh_src); + rh_res= Zero(); + mrhsCG(MrhsCoarseOp,rh_src,rh_res); + } + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +//#include +#include + +using namespace std; +using namespace Grid; + +template +void SaveOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(Operator.Grid()->IsBoss()); + assert(Operator._A.size()==Operator.geom.npoint); + WR.open(file); + for(int p=0;p +void LoadOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + Grid::ScidacReader RD ; + RD.open(file); + assert(Operator._A.size()==Operator.geom.npoint); + for(int p=0;p +void SaveBasis(aggregation &Agg,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(Agg.FineGrid->IsBoss()); + WR.open(file); + for(int b=0;b +void LoadBasis(aggregation &Agg, std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacReader RD ; + RD.open(file); + for(int b=0;b class TestSolver : public LinearFunction { +public: + TestSolver() {}; + void operator() (const Field &in, Field &out){ out = Zero(); } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +// Want Op in CoarsenOp to call MatPcDagMatPc +template +class HermOpAdaptor : public LinearOperatorBase +{ + LinearOperatorBase & wrapped; +public: + HermOpAdaptor(LinearOperatorBase &wrapme) : wrapped(wrapme) {}; + void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); } + void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); } + void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); } + void OpDiag (const Field &in, Field &out) { assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out) { assert(0); }; + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } +}; +template class ChebyshevSmoother : public LinearFunction +{ +public: + using LinearFunction::operator(); + typedef LinearOperatorBase FineOperator; + FineOperator & _SmootherOperator; + Chebyshev Cheby; + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) : + _SmootherOperator(SmootherOperator), + Cheby(_lo,_hi,_ord,InverseApproximation) 
+ { + std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + + ///////////////////////// Configuration ///////////////////////////////// + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + //////////////////////// Fermion action ////////////////////////////////// + MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + + typedef HermOpAdaptor HermFineMatrix; + HermFineMatrix FineHermOp(HermOpEO); + + LatticeFermion result(FrbGrid); result=Zero(); + + LatticeFermion src(FrbGrid); random(RNG5,src); + + // Run power method on FineHermOp + PowerMethod PM; PM(HermOpEO,src); + + + //////////////////////////////////////////////////////////// + ///////////// Coarse basis and Little Dirac Operator /////// + //////////////////////////////////////////////////////////// + typedef GeneralCoarsenedMatrix LittleDiracOperator; + typedef LittleDiracOperator::CoarseVector CoarseVector; + + NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); + NearestStencilGeometry5D geom_nn(Coarse5d); + + // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp + typedef Aggregation Subspace; + Subspace Aggregates(Coarse5d,FrbGrid,cb); + + //////////////////////////////////////////////////////////// + // Need to check about red-black grid coarsening + //////////////////////////////////////////////////////////// + LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d); + + bool load=false; + if ( load ) { + LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); + LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); + } else { + Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, + 95.0,0.1, + // 400,200,200 -- 48 iters + // 600,200,200 -- 38 iters, 162s + // 600,200,100 -- 38 iters, 169s + // 600,200,50 -- 88 iters. 370s + 800, + 200, + 100, + 0.0); + LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); + SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac"); + SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac"); + } + + // Try projecting to one hop only + LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d); + LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? 
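+  // Restrict the fully coarsened operator to its nearest-neighbour couplings only;
+  // the first argument is (presumably) a small regulating shift applied during the
+  // projection -- cf. the tuning note above suggesting 0.02.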
+ + typedef HermitianLinearOperator HermMatrix; + HermMatrix CoarseOp (LittleDiracOp); + HermMatrix CoarseOpProj (LittleDiracOpProj); + + ////////////////////////////////////////// + // Build a coarse lanczos + ////////////////////////////////////////// + Chebyshev IRLCheby(0.2,40.0,71); // 1 iter + FunctionHermOp IRLOpCheby(IRLCheby,CoarseOp); + PlainHermOp IRLOp (CoarseOp); + int Nk=48; + int Nm=64; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + CoarseVector c_src(Coarse5d); + //c_src=1.0; + random(CRNG,c_src); + + CoarseVector c_res(Coarse5d); + CoarseVector c_ref(Coarse5d); + + PowerMethod cPM; cPM(CoarseOp,c_src); + + IRL.calc(eval,evec,c_src,Nconv); + DeflatedGuesser DeflCoarseGuesser(evec,eval); + + ////////////////////////////////////////// + // Build a coarse space solver + ////////////////////////////////////////// + int maxit=20000; + ConjugateGradient CG(1.0e-8,maxit,false); + ConjugateGradient CGfine(1.0e-8,10000,false); + ZeroGuesser CoarseZeroGuesser; + + // HPDSolver HPDSolve(CoarseOp,CG,CoarseZeroGuesser); + HPDSolver HPDSolve(CoarseOp,CG,DeflCoarseGuesser); + c_res=Zero(); + HPDSolve(c_src,c_res); c_ref = c_res; + std::cout << GridLogMessage<<"src norm "< HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser); + c_res=Zero(); + HPDSolveProj(c_src,c_res); + std::cout << GridLogMessage<<"src norm "< + CoarseSmoother(1.0,37.,8,CoarseOpProj); // just go to sloppy 0.1 convergence + // CoarseSmoother(0.1,37.,8,CoarseOpProj); // + // CoarseSmoother(0.5,37.,6,CoarseOpProj); // 8 iter 0.36s + // CoarseSmoother(0.5,37.,12,CoarseOpProj); // 8 iter, 0.55s + // CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter + // CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter + // ChebyshevSmoother CoarseSmoother(0.5,36.,10,CoarseOpProj); // 311 + + //////////////////////////////////////////////////////// + // CG, Cheby mode spacing 200,200 + // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s + // Unprojected Coarse CG solve to 4e-2 : 33 iters, 0.8s + // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s + //////////////////////////////////////////////////////// + // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs + //////////////////////////////////////////////////////// + // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s 2.1x gain + // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s + // HDCG 38 iters 162s + // + // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs + // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s 2.1x gain + // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s + // HDCG 38 iters 169s + + TwoLevelADEF1defl + cADEF1(1.0e-8, 500, + CoarseOp, + CoarseSmoother, + evec,eval); + + c_res=Zero(); + cADEF1(c_src,c_res); + std::cout << GridLogMessage<<"src norm "< Smoother(10.0,100.0,10,FineHermOp); //499 + // ChebyshevSmoother Smoother(3.0,100.0,10,FineHermOp); //383 + // ChebyshevSmoother Smoother(1.0,100.0,10,FineHermOp); //328 + // std::vector los({0.5,1.0,3.0}); // 147/142/146 nbasis 1 + // std::vector los({1.0,2.0}); // Nbasis 24: 88,86 iterations + // std::vector los({2.0,4.0}); // Nbasis 32 == 52, iters + // std::vector los({2.0,4.0}); // Nbasis 40 == 36,36 iters + + // + // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40 + // Need to measure cost of coarse space.
+ // + // -- i) Reduce coarse residual -- 0.04 + // -- ii) Lanczos on coarse space -- done + // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and + // use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec + // + std::vector los({3.0}); // Nbasis 40 == 36,36 iters + + // std::vector ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults) + std::vector ords({7}); // Nbasis 40 == 40 iters (320 mults) + + for(int l=0;l CGsloppy(4.0e-2,maxit,false); + HPDSolver HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser); + + // ChebyshevSmoother Smoother(lo,92,10,FineHermOp); // 36 best case + ChebyshevSmoother Smoother(lo,92,ords[o],FineHermOp); // 311 + + ////////////////////////////////////////// + // Build a HDCG solver + ////////////////////////////////////////// + TwoLevelADEF2 + HDCG(1.0e-8, 100, + FineHermOp, + Smoother, + HPDSolveSloppy, + HPDSolve, + Aggregates); + + TwoLevelADEF2 + HDCGdefl(1.0e-8, 100, + FineHermOp, + Smoother, + cADEF1, + HPDSolve, + Aggregates); + + result=Zero(); + HDCGdefl(src,result); + + result=Zero(); + HDCG(src,result); + + + } + } + + // Standard CG + result=Zero(); + CGfine(HermOpEO, src, result); + + Grid_finalize(); + return 0; +} diff --git a/tests/debug/Test_general_coarse_hdcg_phys.cc b/tests/debug/Test_general_coarse_hdcg_phys.cc new file mode 100644 index 00000000..3ec42fad --- /dev/null +++ b/tests/debug/Test_general_coarse_hdcg_phys.cc @@ -0,0 +1,444 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_general_coarse_hdcg.cc + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +//#include +#include + +using namespace std; +using namespace Grid; + +template +void SaveOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(Operator.Grid()->IsBoss()); + assert(Operator._A.size()==Operator.geom.npoint); + WR.open(file); + for(int p=0;p +void LoadOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + Grid::ScidacReader RD ; + RD.open(file); + assert(Operator._A.size()==Operator.geom.npoint); + for(int p=0;p +void ReLoadOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + Grid::ScidacReader RD ; + RD.open(file); + assert(Operator._A.size()==Operator.geom.npoint); + for(int p=0;p +void SaveBasis(aggregation &Agg,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(Agg.FineGrid->IsBoss()); + WR.open(file); + for(int b=0;b +void LoadBasis(aggregation &Agg, std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacReader RD ; + RD.open(file); + for(int b=0;b +class HermOpAdaptor : public LinearOperatorBase +{ + LinearOperatorBase & wrapped; +public: + HermOpAdaptor(LinearOperatorBase &wrapme) : wrapped(wrapme) {}; + void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); } + void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); } + void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); } + void OpDiag (const Field &in, Field &out) { assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out) { assert(0); }; + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } +}; +/* +template class ChebyshevSmoother : public LinearFunction +{ +public: + using LinearFunction::operator(); + typedef LinearOperatorBase FineOperator; + FineOperator & _SmootherOperator; + Chebyshev Cheby; + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) : + _SmootherOperator(SmootherOperator), + Cheby(_lo,_hi,_ord,InverseApproximation) + { + std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"< class CGSmoother : public LinearFunction +{ +public: + using LinearFunction::operator(); + typedef LinearOperatorBase FineOperator; + FineOperator & _SmootherOperator; + int iters; + CGSmoother(int _iters, FineOperator &SmootherOperator) : + _SmootherOperator(SmootherOperator), + iters(_iters) + { + std::cout << GridLogMessage<<" Mirs smoother order "< CG(0.0,iters,false); // non-converge is just fine in a smoother + CG(_SmootherOperator,in,out); + } +}; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=24; + const int nbasis = 62; + const int cb = 0 ; + RealD mass=0.00078; + RealD M5=1.8; + RealD b=1.5; + RealD c=0.5; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + // Construct a coarsened grid with 4^4 cell + Coordinate Block({4,4,6,4}); + 
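The loop just below divides each fine extent by the block size to make the coarse lattice. As a worked instance of that bookkeeping, here is a standalone sketch; the 48^3 x 96 fine volume is an assumption suggested by the "phys48" file names used later in this test (the real extents come from GridDefaultLatt()), and all helper names are invented for the demo:

    #include <cstdio>
    // Coarse-lattice bookkeeping for Coordinate Block({4,4,6,4}) above,
    // assuming a 48^3 x 96 fine 4d volume (hypothetical for this sketch).
    int main(){
      int latt[4]  = {48,48,48,96};
      int block[4] = {4,4,6,4};
      long coarse_sites = 1;
      for(int d=0;d<4;d++){
        std::printf("dim %d : %d / %d = %d\n", d, latt[d], block[d], latt[d]/block[d]);
        coarse_sites *= latt[d]/block[d];
      }
      // 12 x 12 x 8 x 24 coarse cells; each coarse site carries nbasis
      // (62 in this test) complex degrees of freedom per coarse vector.
      std::printf("coarse sites = %ld\n", coarse_sites);
      return 0;
    }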
Coordinate clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + + ///////////////////////// Configuration ///////////////////////////////// + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + //////////////////////// Fermion action ////////////////////////////////// + MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + + typedef HermOpAdaptor HermFineMatrix; + HermFineMatrix FineHermOp(HermOpEO); + + LatticeFermion result(FrbGrid); result=Zero(); + + LatticeFermion src(FrbGrid); random(RNG5,src); + + // Run power method on FineHermOp + PowerMethod PM; PM(HermOpEO,src); + + //////////////////////////////////////////////////////////// + ///////////// Coarse basis and Little Dirac Operator /////// + //////////////////////////////////////////////////////////// + typedef GeneralCoarsenedMatrix LittleDiracOperator; + typedef LittleDiracOperator::CoarseVector CoarseVector; + + NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); + NearestStencilGeometry5D geom_nn(Coarse5d); + + // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp + typedef Aggregation Subspace; + Subspace Aggregates(Coarse5d,FrbGrid,cb); + + //////////////////////////////////////////////////////////// + // Need to check about red-black grid coarsening + //////////////////////////////////////////////////////////// + LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d); + + std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.rat.scidac.62"); + std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.rat.scidac.62"); + std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.rat.scidac.62"); + bool load_agg=true; + bool load_refine=true; + bool load_mat=true; + if ( load_agg ) { + LoadBasis(Aggregates,subspace_file); + } else { + + // NBASIS=40 + // Best so far: ord 2000 [0.01,95], 500,500 -- 466 iters + // slurm-398626.out:Grid : Message : 141.295253 s : 500 filt [1] 0.000103622063 + + + //Grid : Message : 33.870465 s : Chebyshev subspace pass-1 : ord 2000 [0.001,95] + //Grid : Message : 33.870485 s : Chebyshev subspace pass-2 : nbasis40 min 1000 step 1000 lo0 + //slurm-1482200.out : filt ~ 0.004 -- not as low mode projecting -- took 626 iters + + // To try: 2000 [0.1,95] ,2000,500,500 -- slurm-1482213.out 586 iterations + + // To try: 2000 [0.01,95] ,2000,500,500 -- 469 (think I bumped 92 to 95) (??) 
+ // To try: 2000 [0.025,95],2000,500,500 + // To try: 2000 [0.005,95],2000,500,500 + + // NBASIS=44 -- HDCG paper was 64 vectors; AMD compiler craps out at 48 + // To try: 2000 [0.01,95] ,2000,500,500 -- 419 lowest slurm-1482355.out + // To try: 2000 [0.025,95] ,2000,500,500 -- 487 + // To try: 2000 [0.005,95] ,2000,500,500 + /* + Smoother [3,92] order 16 +slurm-1482355.out:Grid : Message : 35.239686 s : Chebyshev subspace pass-1 : ord 2000 [0.01,95] +slurm-1482355.out:Grid : Message : 35.239714 s : Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0 +slurm-1482355.out:Grid : Message : 5561.305552 s : HDCG: Pcg converged in 419 iterations and 2616.202598 s + +slurm-1482367.out:Grid : Message : 43.157235 s : Chebyshev subspace pass-1 : ord 2000 [0.025,95] +slurm-1482367.out:Grid : Message : 43.157257 s : Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0 +slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 iterations and 3131.185821 s + */ + /* + Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, + 95.0,0.0075, + 2500, + 500, + 500, + 0.0); + */ + + /* + Aggregates.CreateSubspaceChebyshevPowerLaw(RNG5,HermOpEO,nbasis, + 95.0, + 2000); + */ + + Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO, + 0.0003,1.0e-5,2000); // Lo, tol, maxit + /* + Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, + 95.0,0.05, + 2000, + 500, + 500, + 0.0); + */ + /* + Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, + 95.0,0.01, + 2000, + 500, + 500, + 0.0); + */ + // Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); -- running slurm-1484934.out nbasis 56 + + // Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); <== last run + SaveBasis(Aggregates,subspace_file); + } + + int refine=1; + if(refine){ + if ( load_refine ) { + LoadBasis(Aggregates,refine_file); + } else { + // HDCG used Pcg to refine + Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); + SaveBasis(Aggregates,refine_file); + } + } + + Aggregates.Orthogonalise(); + if ( load_mat ) { + LoadOperator(LittleDiracOp,ldop_file); + } else { + LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); + SaveOperator(LittleDiracOp,ldop_file); + } + + // I/O test: + CoarseVector c_src(Coarse5d); random(CRNG,c_src); + CoarseVector c_res(Coarse5d); + CoarseVector c_ref(Coarse5d); + + ////////////////////////////////////////// + // Build a coarse lanczos + ////////////////////////////////////////// + typedef HermitianLinearOperator HermMatrix; + HermMatrix CoarseOp (LittleDiracOp); + + int Nk=192; + int Nm=256; + int Nstop=Nk; + + Chebyshev IRLCheby(0.005,40.0,201); + // Chebyshev IRLCheby(0.010,45.0,201); // 1 iter + FunctionHermOp IRLOpCheby(IRLCheby,CoarseOp); + PlainHermOp IRLOp (CoarseOp); + + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1e-5,10); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + + PowerMethod cPM; cPM(CoarseOp,c_src); + + IRL.calc(eval,evec,c_src,Nconv); + + ////////////////////////////////////////// + // Deflated guesser + ////////////////////////////////////////// + DeflatedGuesser DeflCoarseGuesser(evec,eval); + + int maxit=30000; + ConjugateGradient CG(1.0e-10,maxit,false); + ConjugateGradient CGfine(1.0e-8,30000,false); + + ////////////////////////////////////////// + // HDCG + ////////////////////////////////////////// + + std::vector los({2.0,2.5}); // Nbasis 40 == 36,36 iters + std::vector ords({9}); // Nbasis 40 == 40 iters (320 mults) + + for(int l=0;l CGsloppy(4.0e-2,maxit,false); + HPDSolver 
HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser); + HPDSolver HPDSolve(CoarseOp,CG,DeflCoarseGuesser); + + ////////////////////////////////////////// + // IRS shifted smoother based on CG + ////////////////////////////////////////// + RealD MirsShift = lo; + ShiftedHermOpLinearOperator ShiftedFineHermOp(HermOpEO,MirsShift); + CGSmoother CGsmooth(ords[o],ShiftedFineHermOp) ; + + ////////////////////////////////////////// + // Build a HDCG solver + ////////////////////////////////////////// + TwoLevelADEF2 + HDCG(1.0e-8, 700, + FineHermOp, + CGsmooth, + HPDSolveSloppy, + HPDSolve, + Aggregates); + + result=Zero(); + HDCG(src,result); + + } + } + + // Standard CG + result=Zero(); + CGfine(HermOpEO, src, result); + + Grid_finalize(); + return 0; +} diff --git a/tests/debug/Test_general_coarse_hdcg_phys48.cc b/tests/debug/Test_general_coarse_hdcg_phys48.cc new file mode 100644 index 00000000..3d0270c4 --- /dev/null +++ b/tests/debug/Test_general_coarse_hdcg_phys48.cc @@ -0,0 +1,513 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_general_coarse_hdcg.cc + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +class HDCGwrapper { + +}; + +/* +template +void SaveOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(Operator.Grid()->IsBoss()); + assert(Operator._A.size()==Operator.geom.npoint); + WR.open(file); + for(int p=0;p +void LoadOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + Grid::ScidacReader RD ; + RD.open(file); + assert(Operator._A.size()==Operator.geom.npoint); + for(int p=0;p +void ReLoadOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + Grid::ScidacReader RD ; + RD.open(file); + assert(Operator._A.size()==Operator.geom.npoint); + for(int p=0;p +void SaveBasis(aggregation &Agg,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(Agg.FineGrid->IsBoss()); + WR.open(file); + for(int b=0;b +void LoadBasis(aggregation &Agg, std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacReader RD ; + RD.open(file); + for(int b=0;b +void SaveEigenvectors(std::vector &eval, + std::vector &evec, + std::string evec_file, + std::string eval_file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(evec[0].Grid()->IsBoss()); + WR.open(evec_file); + for(int b=0;b +void LoadEigenvectors(std::vector &eval, + std::vector &evec, + std::string evec_file, + std::string eval_file) +{ +#ifdef HAVE_LIME + XmlReader RDx(eval_file); + read(RDx,"evals",eval); + emptyUserRecord record; + + Grid::ScidacReader RD ; + RD.open(evec_file); + assert(evec.size()==eval.size()); + for(int k=0;k +class HermOpAdaptor : public LinearOperatorBase +{ + LinearOperatorBase & wrapped; +public: + HermOpAdaptor(LinearOperatorBase &wrapme) : wrapped(wrapme) {}; + void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); } + void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); } + void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); } + void OpDiag (const Field &in, Field &out) { assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out) { assert(0); }; + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } +}; + +template class CGSmoother : public LinearFunction +{ +public: + using LinearFunction::operator(); + typedef LinearOperatorBase FineOperator; + FineOperator & _SmootherOperator; + int iters; + CGSmoother(int _iters, FineOperator &SmootherOperator) : + _SmootherOperator(SmootherOperator), + iters(_iters) + { + std::cout << GridLogMessage<<" Mirs smoother order "< CG(0.0,iters,false); // non-converge is just fine in a smoother + + out=Zero(); + + CG(_SmootherOperator,in,out); + } +}; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=24; + const int nbasis = 62; + const int cb = 0 ; + RealD mass=0.00078; + RealD M5=1.8; + RealD b=1.5; + RealD c=0.5; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = 
SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + // Construct a coarsened grid with 4^4 cell + Coordinate Block({4,4,6,4}); + Coordinate clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + + ///////////////////////// Configuration ///////////////////////////////// + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + //////////////////////// Fermion action ////////////////////////////////// + MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + + typedef HermOpAdaptor HermFineMatrix; + HermFineMatrix FineHermOp(HermOpEO); + + // Run power method on FineHermOp + // PowerMethod PM; PM(HermOpEO,src); + //////////////////////////////////////////////////////////// + ///////////// Coarse basis and Little Dirac Operator /////// + //////////////////////////////////////////////////////////// + typedef GeneralCoarsenedMatrix LittleDiracOperator; + typedef LittleDiracOperator::CoarseVector CoarseVector; + + NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); + + // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp + typedef Aggregation Subspace; + Subspace Aggregates(Coarse5d,FrbGrid,cb); + + //////////////////////////////////////////////////////////// + // Need to check about red-black grid coarsening + //////////////////////////////////////////////////////////// + // LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d); + + std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62"); + std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.hdcg.62"); + std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.new.62"); + std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac"); + std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml"); + bool load_agg=false; + bool load_refine=false; + bool load_mat=false; + bool load_evec=false; + std::cout << GridLogMessage <<" Restoring from checkpoint "< coarseCG(4.0e-2,20000,true); + + const int nrhs=vComplex::Nsimd()*3; // 12 + + Coordinate mpi=GridDefaultMpi(); + Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); + Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); + Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); + + GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); + typedef MultiGeneralCoarsenedMatrix MultiGeneralCoarsenedMatrix_t; + MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); + + std::cout << "**************************************"< MrhsHermMatrix; + // FunctionHermOp IRLOpCheby(IRLCheby,CoarseOp); + // PlainHermOp IRLOp (CoarseOp); + Chebyshev IRLCheby(0.006,42.0,301); // 1 iter + MrhsHermMatrix MrhsCoarseOp (mrhs); + + CoarseVector pm_src(CoarseMrhs); + pm_src = ComplexD(1.0); + PowerMethod cPM; cPM(MrhsCoarseOp,pm_src); + + int Nk=192; + int Nm=384; + int Nstop=Nk; + int Nconv_test_interval=1; + + ImplicitlyRestartedBlockLanczosCoarse IRL(MrhsCoarseOp, + Coarse5d, + CoarseMrhs, + nrhs, + IRLCheby, + Nstop, + Nconv_test_interval, + nrhs, + Nk, + Nm, + 1e-5,10); + + int Nconv; + 
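The Chebyshev handed to the block Lanczos above is a spectral filter: mapping [lo,hi] onto [-1,1] keeps |T_n| <= 1 over the bulk of the spectrum, while an eigenvalue below lo lands at |y| > 1 and is amplified roughly as cosh(n acosh|y|), so the wanted low modes dominate the restarted iteration. A toy of the polynomial arithmetic only, not of the Grid class, assuming order 301 acts as the single top polynomial T_300; the sample eigenvalues are invented:

    #include <cmath>
    #include <cstdio>
    // T_n(y) by the three-term recurrence; for |y|>1 it grows
    // exponentially in n, like cosh(n*acosh|y|).
    double chebyT(int n, double y){
      double Tm = 1.0, T = y;
      if(n == 0) return Tm;
      for(int k = 1; k < n; k++){ double Tp = 2.0*y*T - Tm; Tm = T; T = Tp; }
      return T;
    }
    int main(){
      double lo = 0.006, hi = 42.0;  // window of IRLCheby above
      int n = 300;
      for(double lambda : {0.001, 0.004, 0.006, 1.0, 20.0, 42.0}){
        double y = (2.0*lambda - hi - lo)/(hi - lo); // map [lo,hi] -> [-1,1]
        std::printf("lambda=%8.4f  y=%+9.6f  |T_n(y)| = %g\n",
                    lambda, y, std::fabs(chebyT(n,y)));
      }
      return 0;
    }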
std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + std::vector c_src(nrhs,Coarse5d); + + /////////////////////// + // Deflation guesser object + /////////////////////// + MultiRHSDeflation MrhsGuesser; + + ////////////////////////////////////////// + // Block projector for coarse/fine + ////////////////////////////////////////// + MultiRHSBlockProject MrhsProjector; + + ////////////////////////// + // Extra HDCG parameters + ////////////////////////// + int maxit=3000; + ConjugateGradient CG(5.0e-2,maxit,false); + RealD lo=2.0; + int ord = 7; + + DoNothingGuesser DoNothing; + HPDSolver HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); + HPDSolver HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing); + + ///////////////////////////////////////////////// + // Mirs smoother + ///////////////////////////////////////////////// + RealD MirsShift = lo; + ShiftedHermOpLinearOperator ShiftedFineHermOp(HermOpEO,MirsShift); + CGSmoother CGsmooth(ord,ShiftedFineHermOp) ; + + + if ( load_refine ) { + LoadBasis(Aggregates,refine_file); + } else { +#if 1 + // Make a copy as subspace gets block orthogonalised + // HDCG used Pcg to refine + int Refineord = 11; + // Not as good as refining with shifted CG (169 iters), but 10% + // Datapoints + //- refining to 0.001 and shift 0.0 is expensive, but gets to 180 outer iterations + //- refining to 0.001 and shift 0.001 is cheap, but gets to 240 outer iterations + //- refining to 0.0005 and shift 0.0005 is cheap, but gets to 230 outer iterations + //- refining to 0.001 and shift 0.0001 220 iterations + //- refining to 0.001 and shift 0.00003 + RealD RefineShift = 0.00003; + RealD RefineTol = 0.001; + ShiftedHermOpLinearOperator RefineFineHermOp(HermOpEO,RefineShift); + + mrhs.CoarsenOperator(RefineFineHermOp,Aggregates,Coarse5d); + + MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d); + + MrhsProjector.ImportBasis(Aggregates.subspace); + + // Lanczos with random start + for(int r=0;r CGsmooth(Refineord,ShiftedFineHermOp) ; + TwoLevelADEF2mrhs + HDCGmrhsRefine(RefineTol, 500, + RefineFineHermOp, + CGsmooth, + HPDSolveMrhs, // Used in M1 + HPDSolveMrhs, // Used in Vstart + MrhsProjector, + MrhsGuesser, + CoarseMrhs); + + // Reload the first pass aggregates, because we orthogonalised them + LoadBasis(Aggregates,subspace_file); + + Aggregates.RefineSubspaceHDCG(HermOpEO, + HDCGmrhsRefine, + nrhs); + +#else + Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters +#endif + + SaveBasis(Aggregates,refine_file); + } + Aggregates.Orthogonalise(); + + /* + if ( load_mat ) { + LoadOperator(LittleDiracOp,ldop_file); + } else { + LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); + SaveOperator(LittleDiracOp,ldop_file); + } + */ + + std::cout << "**************************************"< + HDCGmrhs(1.0e-8, 500, + FineHermOp, + CGsmooth, + HPDSolveMrhs, // Used in M1 + HPDSolveMrhs, // Used in Vstart + MrhsProjector, + MrhsGuesser, + CoarseMrhs); + + std::vector src_mrhs(nrhs,FrbGrid); + std::vector res_mrhs(nrhs,FrbGrid); + + for(int r=0;r + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +template +void SaveBasis(aggregation &Agg,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(Agg.FineGrid->IsBoss()); + WR.open(file); + for(int b=0;b +void LoadBasis(aggregation &Agg, std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacReader RD ; + RD.open(file); + for(int b=0;b +void SaveEigenvectors(std::vector &eval, + std::vector &evec, + std::string evec_file, + std::string eval_file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + ScidacWriter WR(evec[0].Grid()->IsBoss()); + WR.open(evec_file); + for(int b=0;b +void LoadEigenvectors(std::vector &eval, + std::vector &evec, + std::string evec_file, + std::string eval_file) +{ +#ifdef HAVE_LIME + XmlReader RDx(eval_file); + read(RDx,"evals",eval); + emptyUserRecord record; + + Grid::ScidacReader RD ; + RD.open(evec_file); + assert(evec.size()==eval.size()); + for(int k=0;k +class HermOpAdaptor : public LinearOperatorBase +{ + LinearOperatorBase & wrapped; +public: + HermOpAdaptor(LinearOperatorBase &wrapme) : wrapped(wrapme) {}; + void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); } + void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); } + void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); } + void OpDiag (const Field &in, Field &out) { assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } + void OpDirAll (const Field &in, std::vector &out) { assert(0); }; + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } +}; + +template class CGSmoother : public LinearFunction +{ +public: + using LinearFunction::operator(); + typedef LinearOperatorBase FineOperator; + FineOperator & _SmootherOperator; + int iters; + CGSmoother(int _iters, FineOperator &SmootherOperator) : + _SmootherOperator(SmootherOperator), + iters(_iters) + { + std::cout << GridLogMessage<<" Mirs smoother order "< CG(0.0,iters,false); // non-converge is just fine in a smoother + + out=Zero(); + + CG(_SmootherOperator,in,out); + } +}; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=24; + const int nbasis = 60; + const int cb = 0 ; + RealD mass=0.00078; + RealD M5=1.8; + RealD b=1.5; + RealD c=0.5; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + // Construct a coarsened grid with 4^4 cell + Coordinate Block({4,4,4,4}); + Coordinate clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + + 
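CGSmoother above runs conjugate gradient for a fixed iteration count with tolerance 0.0, so it never declares convergence; applied to the sigma-shifted operator (the MirsShift used below), a handful of iterations damp the high end of the spectrum while barely touching the low modes that the coarse-grid correction is responsible for. A dense, self-contained analogue of that behaviour; the 4x4 matrix, shift and iteration count are invented for the demo:

    #include <array>
    #include <cstdio>
    using Vec = std::array<double,4>;
    // SPD toy operator: one tiny eigenvalue (a "low mode"), three large ones.
    static const double A[4][4] = {
      {0.05, 0.0 , 0.0 , 0.0 },
      {0.0 , 4.0 , 0.5 , 0.0 },
      {0.0 , 0.5 , 5.0 , 0.25},
      {0.0 , 0.0 , 0.25, 6.0 }};
    Vec mul(const Vec &x, double sigma){            // y = (A + sigma) x
      Vec y{};
      for(int i=0;i<4;i++){
        for(int j=0;j<4;j++) y[i] += A[i][j]*x[j];
        y[i] += sigma*x[i];
      }
      return y;
    }
    double dot(const Vec &a, const Vec &b){
      double s=0; for(int i=0;i<4;i++) s += a[i]*b[i]; return s;
    }
    // CG with a fixed iteration count and no exit test, mirroring
    // ConjugateGradient(0.0,iters,false) in CGSmoother.
    Vec cg_fixed(const Vec &b, double sigma, int iters){
      Vec x{}, r=b, p=b;
      double rr = dot(r,r);
      for(int k=0;k<iters;k++){
        Vec Ap = mul(p,sigma);
        double alpha = rr/dot(p,Ap);
        for(int i=0;i<4;i++){ x[i] += alpha*p[i]; r[i] -= alpha*Ap[i]; }
        double rr2 = dot(r,r);
        for(int i=0;i<4;i++) p[i] = r[i] + (rr2/rr)*p[i];
        rr = rr2;
      }
      return x;
    }
    int main(){
      Vec b{1,1,1,1};
      Vec z  = cg_fixed(b, /*sigma*/2.0, /*iters*/3);
      Vec Az = mul(z, 0.0);                 // apply the unshifted A
      // High-mode components are reduced several-fold; the lowest mode is
      // left nearly untouched for the coarse-grid correction to handle.
      for(int i=0;i<4;i++) std::printf("residual[%d] = % .3f\n", i, b[i]-Az[i]);
      return 0;
    }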
///////////////////////// Configuration ///////////////////////////////// + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + //////////////////////// Fermion action ////////////////////////////////// + MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + + typedef HermOpAdaptor HermFineMatrix; + HermFineMatrix FineHermOp(HermOpEO); + + //////////////////////////////////////////////////////////// + ///////////// Coarse basis and Little Dirac Operator /////// + //////////////////////////////////////////////////////////// + typedef GeneralCoarsenedMatrix LittleDiracOperator; + typedef LittleDiracOperator::CoarseVector CoarseVector; + + NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); + + typedef Aggregation Subspace; + Subspace Aggregates(Coarse5d,FrbGrid,cb); + + //////////////////////////////////////////////////////////// + // Need to check about red-black grid coarsening + //////////////////////////////////////////////////////////// + + std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60"); + // std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62"); + std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60"); + std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60"); + std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac"); + std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml"); + bool load_agg=true; + bool load_refine=true; + bool load_mat=false; + bool load_evec=false; + + int refine=1; + if ( load_agg ) { + if ( !(refine) || (!load_refine) ) { + LoadBasis(Aggregates,subspace_file); + } + } else { + // Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO, + // 0.0003,1.0e-5,2000); // Lo, tol, maxit + // Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run + Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.); + SaveBasis(Aggregates,subspace_file); + } + + std::cout << "**************************************"< coarseCG(4.0e-2,20000,true); + + const int nrhs=12; + + Coordinate mpi=GridDefaultMpi(); + Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); + Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); + Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); + + GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); + typedef MultiGeneralCoarsenedMatrix MultiGeneralCoarsenedMatrix_t; + MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); + + std::cout << "**************************************"< MrhsHermMatrix; + Chebyshev IRLCheby(0.005,42.0,301); // 1 iter + MrhsHermMatrix MrhsCoarseOp (mrhs); + + CoarseVector pm_src(CoarseMrhs); + pm_src = ComplexD(1.0); + PowerMethod cPM; cPM(MrhsCoarseOp,pm_src); + + int Nk=192; + int Nm=384; + int Nstop=Nk; + int Nconv_test_interval=1; + + ImplicitlyRestartedBlockLanczosCoarse IRL(MrhsCoarseOp, + Coarse5d, + CoarseMrhs, + nrhs, + IRLCheby, + Nstop, + Nconv_test_interval, + nrhs, + Nk, + Nm, + 1e-5,10); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + std::vector c_src(nrhs,Coarse5d); + + /////////////////////// + // Deflation guesser object + /////////////////////// + MultiRHSDeflation MrhsGuesser; + + 
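A deflation guesser of this kind seeds each solve with x0 = sum_i v_i <v_i,b> / lambda_i built from the Lanczos eigenpairs, so the residual the Krylov solver sees lives in the complement of the deflated space and the effective condition number drops. A minimal sketch with a diagonal toy operator, where the eigenvectors are simply the unit vectors and all numbers are invented:

    #include <cstdio>
    int main(){
      double lambda[4] = {0.01, 0.05, 3.0, 5.0}; // eigenvalues of toy A
      double b[4]  = {1,1,1,1};
      double x0[4] = {0,0,0,0};
      int Nev = 2;                               // deflate the two low modes
      for(int i=0;i<Nev;i++) x0[i] = b[i]/lambda[i];  // <v_i,b>/lambda_i
      for(int i=0;i<4;i++)
        std::printf("i=%d  residual b-Ax0 = % .2f\n", i, b[i]-lambda[i]*x0[i]);
      // Residual vanishes exactly on the deflated modes; what remains is
      // conditioned by [3.0,5.0] rather than [0.01,5.0].
      return 0;
    }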
////////////////////////////////////////// + // Block projector for coarse/fine + ////////////////////////////////////////// + MultiRHSBlockProject MrhsProjector; + + ////////////////////////// + // Extra HDCG parameters + ////////////////////////// + int maxit=3000; + ConjugateGradient CG(5.0e-2,maxit,false); + RealD lo=2.0; + int ord = 7; + + DoNothingGuesser DoNothing; + HPDSolver HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing); + HPDSolver HPDSolveMrhsRefine(MrhsCoarseOp,CG,DoNothing); + + ///////////////////////////////////////////////// + // Mirs smoother + ///////////////////////////////////////////////// + RealD MirsShift = lo; + ShiftedHermOpLinearOperator ShiftedFineHermOp(HermOpEO,MirsShift); + CGSmoother CGsmooth(ord,ShiftedFineHermOp) ; + + + if ( load_refine ) { + LoadBasis(Aggregates,refine_file); + } else { + Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters + SaveBasis(Aggregates,refine_file); + } + Aggregates.Orthogonalise(); + + std::cout << "**************************************"< + HDCGmrhs(1.0e-8, 500, + FineHermOp, + CGsmooth, + HPDSolveMrhs, // Used in M1 + HPDSolveMrhs, // Used in Vstart + MrhsProjector, + MrhsGuesser, + CoarseMrhs); + + std::vector src_mrhs(nrhs,FrbGrid); + std::vector res_mrhs(nrhs,FrbGrid); + + for(int r=0;r CGfine(1.0e-8,30000,false); + CGfine(HermOpEO, src, result); + } +#endif + Grid_finalize(); + return 0; +} diff --git a/tests/debug/Test_general_coarse_pvdagm.cc b/tests/debug/Test_general_coarse_pvdagm.cc new file mode 100644 index 00000000..b0b9d4d8 --- /dev/null +++ b/tests/debug/Test_general_coarse_pvdagm.cc @@ -0,0 +1,267 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_padded_cell.cc + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /* END LEGAL */
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+using namespace std;
+using namespace Grid;
+
+template
+class HermOpAdaptor : public LinearOperatorBase
+{
+  LinearOperatorBase & wrapped;
+public:
+  HermOpAdaptor(LinearOperatorBase &wrapme) : wrapped(wrapme) {};
+  void OpDiag (const Field &in, Field &out) { assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
+  void OpDirAll (const Field &in, std::vector &out){ assert(0); };
+  void Op     (const Field &in, Field &out){
+    wrapped.HermOp(in,out);
+  }
+  void AdjOp  (const Field &in, Field &out){
+    wrapped.HermOp(in,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void HermOp(const Field &in, Field &out){
+    wrapped.HermOp(in,out);
+  }
+
+};
+
+template
+class PVdagMLinearOperator : public LinearOperatorBase {
+  Matrix &_Mat;
+  Matrix &_PV;
+public:
+  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
+
+  void OpDiag (const Field &in, Field &out) { assert(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
+  void OpDirAll (const Field &in, std::vector &out){ assert(0); };
+  void Op     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+  }
+  void AdjOp  (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _PV.M(in,tmp);      // adjoint of PVdag M is Mdag PV: apply PV first,
+    _Mat.Mdag(tmp,out); // then Mdag
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void HermOp(const Field &in, Field &out){
+    std::cout << "HermOp"< class DumbOperator : public LinearOperatorBase {
+public:
+  LatticeComplex scale;
+  DumbOperator(GridBase *grid) : scale(grid)
+  {
+    scale = 0.0;
+    LatticeComplex scalesft(grid);
+    LatticeComplex scaletmp(grid);
+    for(int d=0;d<4;d++){
+      Lattice > x(grid); LatticeCoordinate(x,d+1);
+      LatticeCoordinate(scaletmp,d+1);
+      scalesft = Cshift(scaletmp,d+1,1);
+      scale = 100.0*scale + where( mod(x,2)==(Integer)0, scalesft,scaletmp);
+    }
+    std::cout << " scale\n" << scale << std::endl;
+  }
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {};
+  void OpDir  (const Field &in, Field &out,int dir,int disp){};
+  void OpDirAll (const Field &in, std::vector &out) {};
+
+  void Op     (const Field &in, Field &out){
+    out = scale * in;
+  }
+  void AdjOp  (const Field &in, Field &out){
+    out = scale * in;
+  }
+  void HermOp(const Field &in, Field &out){
+    double n1, n2;
+    HermOpAndNorm(in,out,n1,n2);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+    ComplexD dot;
+
+    out = scale * in;
+
+    dot = innerProduct(in,out);
+    n1=real(dot);
+
+    dot = innerProduct(out,out);
+    n2=real(dot);
+  }
+};
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=2;
+
+  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d seeds4({1,2,3,4});
+  std::vector seeds5({5,6,7,8});
+  std::vector cseeds({5,6,7,8});
+  GridParallelRNG RNG5(FGrid);
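A quick check of the operator algebra in PVdagMLinearOperator above: with A = PVdag M, the adjoint is Adag = Mdag PV, i.e. PV is applied first and Mdag second, as in AdjOp. A self-contained verification of <y,Ax> == <Adag y,x> with fixed 2x2 complex matrices standing in for M and PV (all names and numbers invented for the sketch):

    #include <complex>
    #include <cstdio>
    using C = std::complex<double>;
    struct Mat { C a[2][2]; };
    void mul(const Mat &A, const C x[2], C y[2]){
      for(int i=0;i<2;i++) y[i] = A.a[i][0]*x[0] + A.a[i][1]*x[1];
    }
    Mat dag(const Mat &A){                       // Hermitian conjugate
      Mat D;
      for(int i=0;i<2;i++) for(int j=0;j<2;j++) D.a[i][j] = std::conj(A.a[j][i]);
      return D;
    }
    C inner(const C a[2], const C b[2]){         // <a,b> = a^dag b
      return std::conj(a[0])*b[0] + std::conj(a[1])*b[1];
    }
    int main(){
      Mat M  = {{{{1,2},{0,1}},{{3,-1},{2,0}}}};
      Mat PV = {{{{0,1},{1,1}},{{1,0},{-1,2}}}};
      C x[2] = {{1,0},{0,2}}, y[2] = {{2,1},{1,-1}};
      C t[2], Ax[2], Ady[2];
      mul(M,x,t);   mul(dag(PV),t,Ax);   // Op   : Ax  = PVdag (M x)
      mul(PV,y,t);  mul(dag(M),t,Ady);   // AdjOp: Ady = Mdag (PV y)
      C l = inner(y,Ax), r = inner(Ady,x);
      std::printf("<y,Ax>  = (% .3f,% .3f)\n<Ady,x> = (% .3f,% .3f)\n",
                  l.real(), l.imag(), r.real(), r.imag());
      return 0;
    }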
RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + + LatticeFermion src(FGrid); random(RNG5,src); + LatticeFermion result(FGrid); result=Zero(); + LatticeFermion ref(FGrid); ref=Zero(); + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + //Umu = 1.0; + + RealD mass=0.5; + RealD M5=1.8; + + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5); + + const int nbasis = 1; + const int cb = 0 ; + LatticeFermion prom(FGrid); + + typedef GeneralCoarsenedMatrix LittleDiracOperator; + typedef LittleDiracOperator::CoarseVector CoarseVector; + + NextToNearestStencilGeometry5D geom(Coarse5d); + + std::cout< PVdagM(Ddwf,Dpv); + HermOpAdaptor HOA(PVdagM); + + // Run power method on HOA?? + PowerMethod PM; PM(HOA,src); + + // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp + typedef Aggregation Subspace; + Subspace AggregatesPD(Coarse5d,FGrid,cb); + AggregatesPD.CreateSubspaceChebyshev(RNG5, + HOA, + nbasis, + 5000.0, + 0.02, + 100, + 50, + 50, + 0.0); + + LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d); + LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD); + + std::cout< subspace(nbasis,FGrid); + subspace=AggregatesPD.subspace; + + Complex one(1.0); + c_src = one; // 1 in every element for vector 1. + blockPromote(c_src,err,subspace); + + prom=Zero(); + for(int b=0;b