From 070b61f08fb6ec761a8ba9d4719b57ead0e3e9a9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 14:04:33 -0500 Subject: [PATCH] Simplifying the MultiRHS solver to make it do SRHS *and* MRHS --- Grid/algorithms/iterative/Deflation.h | 157 --------- .../multigrid/GeneralCoarsenedMatrix.h | 14 +- .../GeneralCoarsenedMatrixMultiRHS.h | 239 +++++++++++--- TODO | 43 ++- .../debug/Test_general_coarse_hdcg_phys48.cc | 312 ++---------------- 5 files changed, 287 insertions(+), 478 deletions(-) delete mode 100644 Grid/algorithms/iterative/Deflation.h diff --git a/Grid/algorithms/iterative/Deflation.h b/Grid/algorithms/iterative/Deflation.h deleted file mode 100644 index 1a8f97c9..00000000 --- a/Grid/algorithms/iterative/Deflation.h +++ /dev/null @@ -1,157 +0,0 @@ - /************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h - - Copyright (C) 2015 - -Author: Peter Boyle - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_DEFLATION_H -#define GRID_DEFLATION_H - -namespace Grid { - -template -class ZeroGuesser: public LinearFunction { -public: - using LinearFunction::operator(); - virtual void operator()(const Field &src, Field &guess) { guess = Zero(); }; -}; -template -class DoNothingGuesser: public LinearFunction { -public: - using LinearFunction::operator(); - virtual void operator()(const Field &src, Field &guess) { }; -}; -template -class SourceGuesser: public LinearFunction { -public: - using LinearFunction::operator(); - virtual void operator()(const Field &src, Field &guess) { guess = src; }; -}; - -//////////////////////////////// -// Fine grid deflation -//////////////////////////////// -template -class DeflatedGuesser: public LinearFunction { -private: - const std::vector &evec; - const std::vector &eval; - const unsigned int N; - -public: - using LinearFunction::operator(); - - DeflatedGuesser(const std::vector & _evec,const std::vector & _eval) - : DeflatedGuesser(_evec, _eval, _evec.size()) - {} - - DeflatedGuesser(const std::vector & _evec, const std::vector & _eval, const unsigned int _N) - : evec(_evec), eval(_eval), N(_N) - { - assert(evec.size()==eval.size()); - assert(N <= evec.size()); - } - - virtual void operator()(const Field &src,Field &guess) { - guess = Zero(); - for (int i=0;i -class LocalCoherenceDeflatedGuesser: public LinearFunction { -private: - const std::vector &subspace; - const std::vector &evec_coarse; - const std::vector &eval_coarse; -public: - - using LinearFunction::operator(); - LocalCoherenceDeflatedGuesser(const std::vector &_subspace, - const 
std::vector &_evec_coarse, - const std::vector &_eval_coarse) - : subspace(_subspace), - evec_coarse(_evec_coarse), - eval_coarse(_eval_coarse) - { - } - - void operator()(const FineField &src,FineField &guess) { - int N = (int)evec_coarse.size(); - CoarseField src_coarse(evec_coarse[0].Grid()); - CoarseField guess_coarse(evec_coarse[0].Grid()); guess_coarse = Zero(); - blockProject(src_coarse,src,subspace); - for (int i=0;i &src,std::vector &guess) { - int Nevec = (int)evec_coarse.size(); - int Nsrc = (int)src.size(); - // make temp variables - std::vector src_coarse(Nsrc,evec_coarse[0].Grid()); - std::vector guess_coarse(Nsrc,evec_coarse[0].Grid()); - //Preporcessing - std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl; - for (int j=0;jNd(); Coordinate zero_shift(Nd,0); @@ -102,6 +102,7 @@ public: assert(nfound==geom.npoint); ExchangeCoarseLinks(); } + */ GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid) : geom(_geom), @@ -459,6 +460,9 @@ public: CoarseScalar InnerProd(CoarseGrid()); blockOrthogonalise(InnerProd,Subspace.subspace); + for(int s=0;sGlobalDimensions(); @@ -494,6 +498,7 @@ public: } phase=exp(phase*ci); Mkl(k,l) = phase; + std::cout<<" Mkl "<oSites(); @@ -583,6 +591,10 @@ public: // PopulateAdag(); } + for(int p=0;p Cvec; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; + typedef Lattice FineComplexField; typedef CoarseVector Field; //////////////////// // Data members //////////////////// GridCartesian * _CoarseGridMulti; - GridCartesian * _CoarseGrid; - GeneralCoarseOp & _Op; NonLocalStencilGeometry geom; + NonLocalStencilGeometry geom_srhs; PaddedCell Cell; GeneralLocalStencil Stencil; @@ -77,20 +77,57 @@ public: GridBase * Grid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know GridCartesian * CoarseGrid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know - MultiGeneralCoarsenedMatrix(GeneralCoarseOp & Op,GridCartesian *CoarseGridMulti) : - _Op(Op), - _CoarseGrid(Op.CoarseGrid()), + // Can be used to do I/O on the operator matrices externally + void SetMatrix (int p,CoarseMatrix & A) + { + assert(A.size()==geom_srhs.npoint); + GridtoBLAS(A[p],BLAS_A[p]); + } + void GetMatrix (int p,CoarseMatrix & A) + { + assert(A.size()==geom_srhs.npoint); + BLAStoGrid(A[p],BLAS_A[p]); + } + /* + void CopyMatrix (GeneralCoarseOp &_Op) + { + for(int p=0;plSites(); - int32_t unpadded_sites = _CoarseGrid->lSites(); + int32_t padded_sites = Cell.grids.back()->lSites(); + int32_t unpadded_sites = CoarseGridMulti->lSites(); int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS int32_t orhs = nrhs/CComplex::Nsimd(); + padded_sites = padded_sites/nrhs; + unpadded_sites = unpadded_sites/nrhs; + ///////////////////////////////////////////////// // Device data vector storage ///////////////////////////////////////////////// @@ -98,9 +135,9 @@ public: for(int p=0;p void GridtoBLAS(const Lattice &from,deviceVector &to) { -#if 0 - std::vector tmp; - unvectorizeToLexOrdArray(tmp,from); - assert(tmp.size()==from.Grid()->lSites()); - assert(tmp.size()==to.size()); - to.resize(tmp.size()); - acceleratorCopyToDevice(&tmp[0],&to[0],sizeof(typename vobj::scalar_object)*tmp.size()); -#else typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -206,17 +233,9 @@ public: to[w] = stmp; } }); -#endif } template void 
BLAStoGrid(Lattice &grid,deviceVector &in) { -#if 0 - std::vector tmp; - tmp.resize(in.size()); - assert(in.size()==grid.Grid()->lSites()); - acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size()); - vectorizeFromLexOrdArray(tmp,grid); -#else typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -261,15 +280,152 @@ public: putlane(to[w], stmp, to_lane); } }); -#endif } - void CopyMatrix (void) + void CoarsenOperator(LinearOperatorBase > &linop, + Aggregation & Subspace, + GridBase *CoarseGrid) { - for(int p=0;pGlobalDimensions(); + int Nd = CoarseGrid->Nd(); + /* + * Here, k,l index which possible momentum/shift within the N-points connected by MdagM. + * Matrix index i is mapped to this shift via + * geom.shifts[i] + * + * conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] + * = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > + * = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l} + * = M_{kl} A_ji^{b.b+l} + * + * Must assemble and invert matrix M_k,l = e^[i q_k . delta_l] + * + * Where q_k = delta_k . (2*M_PI/global_nb[mu]) + * + * Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j} + */ + Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint); + Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint); + ComplexD ci(0.0,1.0); + for(int k=0;k phaF(npoint,grid); + std::vector pha(npoint,CoarseGrid); + + CoarseVector coarseInner(CoarseGrid); + + typedef typename CComplex::scalar_type SComplex; + FineComplexField one(grid); one=SComplex(1.0); + FineComplexField zz(grid); zz = Zero(); + for(int p=0;p _A; + _A.resize(geom_srhs.npoint,CoarseGrid); + + std::vector ComputeProj(npoint,CoarseGrid); + CoarseVector FT(CoarseGrid); + for(int i=0;ioSites(); + autoView( A_v , _A[k], AcceleratorWrite); + autoView( FT_v , FT, AcceleratorRead); + accelerator_for(sss, osites, 1, { + for(int j=0;jGlobalDimensions()[0]; + assert(nrhs>=1); + RealD flops,bytes; int64_t osites=in.Grid()->oSites(); // unpadded - int64_t unpadded_vol = _CoarseGrid->lSites(); + int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs; flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd(); bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0] + 2.0*osites*sizeof(siteVector)*npoint; - int64_t nrhs =pin.Grid()->GlobalDimensions()[0]; - assert(nrhs>=1); t_GtoB=-usecond(); GridtoBLAS(pin,BLAS_B); @@ -339,7 +496,7 @@ public: BLAStoGrid(out,BLAS_C); t_BtoG+=usecond(); t_tot+=usecond(); - + /* std::cout << GridLogMessage << "New Mrhs coarse DONE "< &out){assert(0);}; - }; NAMESPACE_END(Grid); diff --git a/TODO b/TODO index 750deb55..5e8da5d1 100644 --- a/TODO +++ b/TODO @@ -1,6 +1,44 @@ -- - Slice sum optimisation & A2A - atomic addition +i) Clean up CoarsenedMatrix, GeneralCoarsenedMatrix, GeneralCoarsenedMatrixMultiRHS + + -- Ideally want a SINGLE implementation that does MultiRHS **AND** works with one RHS. + + -- -- Getting there. One RHS is hard due to vectorisation & hardwired coarse5d layout + -- Compromise: Wrap it in a copy in/out for a slice. + + -- Bad for Lanczos: need to do a BLOCK Lanczos instead. Longer term. + + -- **** Make the test do ONLY the single RHS. **** + -- I/O for the matrix elements required. + -- Make the Adef2 build an eigenvector deflater and a block projector + -- + + -- Work with Regensburg on tests. + -- Plan interface preserving the coarsened matrix interface (??) 
+ +-- Move functionality from GeneralCoarsenedMatrix INTO GeneralCoarsenedMatrixMultiRHS -- DONE + -- Don't immediately delete original + -- Instead make the new one self contained, then delete. + -- New DWF inverter test. + + // void PopulateAdag(void) + void CoarsenOperator(LinearOperatorBase > &linop, Aggregation & Subspace) -- DONE + ExchangeCoarseLinks(); + +iii) Aurora -- christoph's problem -- DONE + Aurora -- Carleton's problem staggered. + +iv) Dennis merge and test Aurora -- DONE (save test) + +v) Merge Ed Bennet's request --DONE + +vi) Repro CG -- get down to the level of single node testing via split grid test + + +========================= + +=============== +- - Slice sum optimisation & A2A - atomic addition -- Dennis - - Also faster non-atomic reduction -- - Remaining PRs - - DDHMC - - MixedPrec is the action eval, high precision - - MixedPrecCleanup is the force eval, low precision @@ -17,7 +55,6 @@ DDHMC -- Multishift Mixed Precision - DONE -- Pole dependent residual - DONE - ======= -- comms threads issue?? -- Part done: Staggered kernel performance on GPU diff --git a/tests/debug/Test_general_coarse_hdcg_phys48.cc b/tests/debug/Test_general_coarse_hdcg_phys48.cc index a5fc18e4..f67d651d 100644 --- a/tests/debug/Test_general_coarse_hdcg_phys48.cc +++ b/tests/debug/Test_general_coarse_hdcg_phys48.cc @@ -208,9 +208,6 @@ public: }; -gridblasHandle_t GridBLAS::gridblasHandle; -int GridBLAS::gridblasInit; - int main (int argc, char ** argv) { Grid_init(&argc,&argv); @@ -281,7 +278,6 @@ int main (int argc, char ** argv) typedef LittleDiracOperator::CoarseVector CoarseVector; NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d); - NearestStencilGeometry5D geom_nn(Coarse5d); // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp typedef Aggregation Subspace; @@ -309,75 +305,12 @@ int main (int argc, char ** argv) LoadBasis(Aggregates,subspace_file); } } else { - - // NBASIS=40 - // Best so far: ord 2000 [0.01,95], 500,500 -- 466 iters - // slurm-398626.out:Grid : Message : 141.295253 s : 500 filt [1] 0.000103622063 - - - //Grid : Message : 33.870465 s : Chebyshev subspace pass-1 : ord 2000 [0.001,95] - //Grid : Message : 33.870485 s : Chebyshev subspace pass-2 : nbasis40 min 1000 step 1000 lo0 - //slurm-1482200.out : filt ~ 0.004 -- not as low mode projecting -- took 626 iters - - // To try: 2000 [0.1,95] ,2000,500,500 -- slurm-1482213.out 586 iterations - - // To try: 2000 [0.01,95] ,2000,500,500 -- 469 (think I bumped 92 to 95) (??) 
- // To try: 2000 [0.025,95],2000,500,500 - // To try: 2000 [0.005,95],2000,500,500 - - // NBASIS=44 -- HDCG paper was 64 vectors; AMD compiler craps out at 48 - // To try: 2000 [0.01,95] ,2000,500,500 -- 419 lowest slurm-1482355.out - // To try: 2000 [0.025,95] ,2000,500,500 -- 487 - // To try: 2000 [0.005,95] ,2000,500,500 - /* - Smoother [3,92] order 16 -slurm-1482355.out:Grid : Message : 35.239686 s : Chebyshev subspace pass-1 : ord 2000 [0.01,95] -slurm-1482355.out:Grid : Message : 35.239714 s : Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0 -slurm-1482355.out:Grid : Message : 5561.305552 s : HDCG: Pcg converged in 419 iterations and 2616.202598 s - -slurm-1482367.out:Grid : Message : 43.157235 s : Chebyshev subspace pass-1 : ord 2000 [0.025,95] -slurm-1482367.out:Grid : Message : 43.157257 s : Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0 -slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 iterations and 3131.185821 s - */ - /* - Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, - 95.0,0.0075, - 2500, - 500, - 500, - 0.0); - */ - - /* - Aggregates.CreateSubspaceChebyshevPowerLaw(RNG5,HermOpEO,nbasis, - 95.0, - 2000); - */ - Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO, 0.0003,1.0e-5,2000); // Lo, tol, maxit - /* - Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, - 95.0,0.05, - 2000, - 500, - 500, - 0.0); - */ - /* - Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, - 95.0,0.01, - 2000, - 500, - 500, - 0.0); - */ - // Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); -- running slurm-1484934.out nbasis 56 // Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); <== last run SaveBasis(Aggregates,subspace_file); } - MemoryManager::Print(); if(refine){ if ( load_refine ) { @@ -388,15 +321,15 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it SaveBasis(Aggregates,refine_file); } } - MemoryManager::Print(); + Aggregates.Orthogonalise(); if ( load_mat ) { LoadOperator(LittleDiracOp,ldop_file); } else { LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); - SaveOperator(LittleDiracOp,ldop_file); + // SaveOperator(LittleDiracOp,ldop_file); } - + // I/O test: CoarseVector c_src(Coarse5d); random(CRNG,c_src); CoarseVector c_res(Coarse5d); @@ -428,31 +361,42 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it std::cout< HermMatrix; - HermMatrix CoarseOp (LittleDiracOp); - // HermMatrix CoarseOpProj (LittleDiracOpProj); + std::cout << "**************************************"< coarseCG(4.0e-2,20000,true); + + const int nrhs=vComplex::Nsimd()*3; + + Coordinate mpi=GridDefaultMpi(); + Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); + Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); + Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); + + GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); + // MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); + typedef MultiGeneralCoarsenedMatrix MultiGeneralCoarsenedMatrix_t; + MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs); + // mrhs.CopyMatrix(LittleDiracOp); + // mrhs.SetMatrix(LittleDiracOp.); + mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d); + // mrhs.CheckMatrix(LittleDiracOp); - MemoryManager::Print(); ////////////////////////////////////////// // Build a coarse lanczos ////////////////////////////////////////// - // Chebyshev IRLCheby(0.012,40.0,201); //500 HDCG iters - // int Nk=512; // Didn't save much - // int 
Nm=640; - // int Nstop=400; + std::cout << "**************************************"< IRLCheby(0.005,40.0,201); //319 HDCG iters @ 128//160 nk. - // int Nk=128; - // int Nm=160; + typedef HermitianLinearOperator HermMatrix; + HermMatrix CoarseOp (LittleDiracOp); - // Chebyshev IRLCheby(0.005,40.0,201); //319 HDCG iters @ 128//160 nk. - // Chebyshev IRLCheby(0.04,40.0,201); int Nk=192; int Nm=256; int Nstop=Nk; @@ -491,121 +435,13 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it ConjugateGradient CGfine(1.0e-8,30000,false); ZeroGuesser CoarseZeroGuesser; - - // HPDSolver HPDSolve(CoarseOp,CG,CoarseZeroGuesser); HPDSolver HPDSolve(CoarseOp,CG,DeflCoarseGuesser); c_res=Zero(); - // HPDSolve(c_src,c_res); c_ref = c_res; - // std::cout << GridLogMessage<<"src norm "< HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser); - // c_res=Zero(); - // HPDSolveProj(c_src,c_res); - // std::cout << GridLogMessage<<"src norm "< CoarseSmoother(1.0,37.,8,CoarseOpProj); // just go to sloppy 0.1 convergence - // CoarseSmoother(0.1,37.,8,CoarseOpProj); // - // CoarseSmoother(0.5,37.,6,CoarseOpProj); // 8 iter 0.36s - // CoarseSmoother(0.5,37.,12,CoarseOpProj); // 8 iter, 0.55s - // CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter - // CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter - // ChebyshevSmoother CoarseSmoother(0.5,36.,10,CoarseOpProj); // 311 - - //////////////////////////////////////////////////////// - // CG, Cheby mode spacing 200,200 - // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s - // Unprojected Coarse CG solve to 4e-2 : 33 iters, 0.8s - // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s - //////////////////////////////////////////////////////// - // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs - //////////////////////////////////////////////////////// - // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s 2.1x gain - // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s - // HDCG 38 iters 162s - // - // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs - // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s 2.1x gain - // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s - // HDCG 38 iters 169s - - /* - TwoLevelADEF1defl - cADEF1(1.0e-8, 500, - CoarseOp, - CoarseSmoother, - evec,eval); - */ - // c_res=Zero(); - // cADEF1(c_src,c_res); - // std::cout << GridLogMessage<<"src norm "< Smoother(10.0,100.0,10,FineHermOp); //499 - // ChebyshevSmoother Smoother(3.0,100.0,10,FineHermOp); //383 - // ChebyshevSmoother Smoother(1.0,100.0,10,FineHermOp); //328 - // std::vector los({0.5,1.0,3.0}); // 147/142/146 nbasis 1 - // std::vector los({1.0,2.0}); // Nbasis 24: 88,86 iterations - // std::vector los({2.0,4.0}); // Nbasis 32 == 52, iters - // std::vector los({2.0,4.0}); // Nbasis 40 == 36,36 iters - - // - // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40 - // Need to measure cost of coarse space. - // - // -- i) Reduce coarse residual -- 0.04 - // -- ii) Lanczos on coarse space -- done - // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and - // use a limited stencil. 
Reread BFM code to check on evecs / deflation strategy with prec - // - // - // - // - - MemoryManager::Print(); - ////////////////////////////////////// - // mrhs coarse solve - // Create a higher dim coarse grid - ////////////////////////////////////////////////////////////////////////////////////// - ConjugateGradient coarseCG(4.0e-2,20000,true); - - const int nrhs=vComplex::Nsimd()*3; - - Coordinate mpi=GridDefaultMpi(); - Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]}); - Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]}); - Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1}); - - GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); - MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs); - typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t; + /////////// MRHS test .//////////// typedef HermitianLinearOperator MrhsHermMatrix; MrhsHermMatrix MrhsCoarseOp (mrhs); - MemoryManager::Print(); + #if 1 { CoarseVector rh_res(CoarseMrhs); @@ -644,6 +480,7 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it InsertSlice(c_src,rh_src,r,0); } + std::cout << " Calling the multiRHS coarse CG"< los({2.0,2.5}); // Nbasis 40 == 36,36 iters - // std::vector los({2.0}); - // std::vector los({2.5}); - - // std::vector ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults) - // std::vector ords({7}); // Nbasis 40 == 40 iters (320 mults) - // std::vector ords({9}); // Nbasis 40 == 40 iters (320 mults) - - // 148 outer - // std::vector los({1.0}); - // std::vector ords({24}); - - // 162 outer - // std::vector los({2.5}); - // std::vector ords({9}); - - // ??? outer std::vector los({2.0}); std::vector ords({7}); /* - Smoother opt @56 nbasis, 0.04 convergence, 192 evs - ord lo - - 16 0.1 no converge -- likely sign indefinite - 32 0.1 no converge -- likely sign indefinite(?) 
- - 16 0.5 422 - 32 0.5 302 - - 8 1.0 575 - 12 1.0 449 - 16 1.0 375 - 32 1.0 302 - - 12 3.0 476 - 16 3.0 319 - 32 3.0 306 - Powerlaw setup 62 vecs slurm-1494943.out:Grid : Message : 4874.186617 s : HDCG: Pcg converged in 171 iterations and 1706.548006 s 1.0 32 slurm-1494943.out:Grid : Message : 6490.121648 s : HDCG: Pcg converged in 194 iterations and 1616.219654 s 1.0 16 @@ -727,38 +528,7 @@ slurm-1494242.out:Grid : Message : 6588.727977 s : HDCG: Pcg converged in 205 it -- CG smoother O(16): 290 -- Cheby smoother O(16): 218 -- getting close to the deflation level I expect 169 from BFM paper @O(7) smoother and 64 nbasis -Grid : Message : 2790.797194 s : HDCG: Pcg converged in 190 iterations and 1049.563182 s 1.0 32 -Grid : Message : 3766.374396 s : HDCG: Pcg converged in 218 iterations and 975.455668 s 1.0 16 -Grid : Message : 4888.746190 s : HDCG: Pcg converged in 191 iterations and 1122.252055 s 0.5 32 -Grid : Message : 5956.679661 s : HDCG: Pcg converged in 231 iterations and 1067.812850 s 0.5 16 - -Grid : Message : 2767.405829 s : HDCG: Pcg converged in 218 iterations and 967.214067 s -- 16 -Grid : Message : 3816.165905 s : HDCG: Pcg converged in 251 iterations and 1048.636269 s -- 12 -Grid : Message : 5121.206572 s : HDCG: Pcg converged in 318 iterations and 1304.916168 s -- 8 - - -[paboyle@login2.crusher debug]$ grep -v Memory slurm-402426.out | grep converged | grep HDCG -- [1.0,16] cheby -Grid : Message : 5185.521063 s : HDCG: Pcg converged in 377 iterations and 1595.843529 s - -[paboyle@login2.crusher debug]$ grep HDCG slurm-402184.out | grep onver -Grid : Message : 3760.438160 s : HDCG: Pcg converged in 422 iterations and 2129.243141 s -Grid : Message : 5660.588015 s : HDCG: Pcg converged in 308 iterations and 1900.026821 s - - -Grid : Message : 4238.206528 s : HDCG: Pcg converged in 575 iterations and 2657.430676 s -Grid : Message : 6345.880344 s : HDCG: Pcg converged in 449 iterations and 2108.505208 s - -grep onverg slurm-401663.out | grep HDCG -Grid : Message : 3900.817781 s : HDCG: Pcg converged in 476 iterations and 1992.591311 s -Grid : Message : 5647.202699 s : HDCG: Pcg converged in 306 iterations and 1746.838660 s - - -[paboyle@login2.crusher debug]$ grep converged slurm-401775.out | grep HDCG -Grid : Message : 3583.177025 s : HDCG: Pcg converged in 375 iterations and 1800.896037 s -Grid : Message : 5348.342243 s : HDCG: Pcg converged in 302 iterations and 1765.045018 s - Conclusion: higher order smoother is doing better. Much better. Use a Krylov smoother instead Mirs as in BFM version. - */ // MemoryManager::Print(); @@ -774,14 +544,6 @@ Conclusion: higher order smoother is doing better. Much better. Use a Krylov smo // ChebyshevSmoother Smoother(lo,92,10,FineHermOp); // 36 best case ChebyshevSmoother ChebySmooth(lo,95,ords[o],FineHermOp); // 311 - /* - * CG smooth 11 iter: - slurm-403825.out:Grid : Message : 4369.824339 s : HDCG: fPcg converged in 215 iterations 3.0 - slurm-403908.out:Grid : Message : 3955.897470 s : HDCG: fPcg converged in 236 iterations 1.0 - slurm-404273.out:Grid : Message : 3843.792191 s : HDCG: fPcg converged in 210 iterations 2.0 - * CG smooth 9 iter: - */ - // RealD MirsShift = lo; ShiftedHermOpLinearOperator ShiftedFineHermOp(HermOpEO,MirsShift); CGSmoother CGsmooth(ords[o],ShiftedFineHermOp) ; @@ -820,16 +582,14 @@ Conclusion: higher order smoother is doing better. Much better. 
Use a Krylov smo CoarseMrhs, // Grid needed to Mrhs grid Aggregates); - MemoryManager::Print(); std::cout << "Calling mRHS HDCG"<Barrier(); - MemoryManager::Print(); std::vector src_mrhs(nrhs,FrbGrid); std::cout << " mRHS source"< res_mrhs(nrhs,FrbGrid); std::cout << " mRHS result"<0)src_mrhs[r]=src_mrhs[0];
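
For reference, the momentum-phase relations stated in the CoarsenOperator comment of the new GeneralCoarsenedMatrixMultiRHS.h, written out in clean notation (q_k is built from the coarse global dimensions N_b, delta_l are the stencil shifts in geom.shifts, b labels a coarse block, i,j label basis vectors):

\[
  \overline{\mathrm{pha}_k(b)}\;\mathrm{proj}_{k,ij}(b)
    \;=\; \sum_{l\in\mathrm{ball}} e^{\,i\,q_k\cdot\delta_l}\,
          \big\langle \phi_{b,j}\,\big|\,M^\dagger M\,\big|\,\phi_{b+\delta_l,\,i}\big\rangle
    \;=\; \sum_{l} M_{kl}\,A^{\,b,\,b+\delta_l}_{ji},
  \qquad
  M_{kl} \;=\; e^{\,i\,q_k\cdot\delta_l},
  \quad
  q_k^{\mu} \;=\; \frac{2\pi\,\delta_k^{\mu}}{N_b^{\mu}},
\]
so that, after inverting the small npoint-by-npoint matrix,
\[
  A^{\,b,\,b+\delta_l}_{ji} \;=\; \sum_{m}\big(M^{-1}\big)_{lm}\,\mathrm{ComputeProj}_{m,b,ij},
\]
where ComputeProj_{m,b,ij} is the phase-conjugated block projection on the left-hand side; assembling and inverting M_{kl} is the role of the Mkl / invMkl Eigen matrices in the new CoarsenOperator.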
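
A condensed sketch of the SRHS-as-MRHS flow that the revised Test_general_coarse_hdcg_phys48.cc drives, gathered in one place. The grid layout, CoarsenOperator call, InsertSlice packing and CG parameters follow the test; the template argument lists and the ExtractSlice unpacking are reconstructed guesses (the patch text does not preserve them), so treat them as assumptions rather than the committed code.

    // Sketch of the unified SRHS/MRHS coarse-solve flow from the revised test.
    // Assumes the surrounding typedefs of the test (nbasis, CoarseVector, geom,
    // FineHermOp, Aggregates, Coarse5d, clatt, c_src, c_res); the template
    // arguments below are reconstructed guesses, not copied from the patch.
    const int nrhs = vComplex::Nsimd()*3;                              // number of right-hand sides

    Coordinate mpi   = GridDefaultMpi();
    Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
    Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});   // RHS index is the leftmost dimension
    Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});                 // SIMD-vectorise over the RHS index

    GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);

    typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
    MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);

    // Coarsen straight into the multi-RHS operator; no single-RHS copy step needed.
    mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);

    typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
    MrhsHermMatrix MrhsCoarseOp(mrhs);
    ConjugateGradient<CoarseVector> coarseCG(4.0e-2,20000,true);

    // Pack nrhs single-RHS coarse sources into one multi-RHS field, solve, unpack.
    CoarseVector rh_src(CoarseMrhs);
    CoarseVector rh_res(CoarseMrhs);
    for(int r=0;r<nrhs;r++) InsertSlice(c_src,rh_src,r,0);
    rh_res = Zero();
    coarseCG(MrhsCoarseOp,rh_src,rh_res);
    for(int r=0;r<nrhs;r++) ExtractSlice(c_res,rh_res,r,0);

This is the "wrap it in a copy in/out for a slice" compromise noted in the TODO: a single right-hand side is handled by packing it into (and extracting it from) one slice of the nrhs-extended coarse grid.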