From 09946cf1ba72b56b97743db550857f8c6de25b6d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 15 Nov 2023 18:03:05 -0500 Subject: [PATCH] Improved, works on 48^3 moving to multiRHS optimisations --- tests/debug/Test_general_coarse_hdcg_phys.cc | 296 ++++++++++++++++--- 1 file changed, 254 insertions(+), 42 deletions(-) diff --git a/tests/debug/Test_general_coarse_hdcg_phys.cc b/tests/debug/Test_general_coarse_hdcg_phys.cc index 7e32cc10..61ff998b 100644 --- a/tests/debug/Test_general_coarse_hdcg_phys.cc +++ b/tests/debug/Test_general_coarse_hdcg_phys.cc @@ -67,6 +67,22 @@ void LoadOperator(Coarsened &Operator,std::string file) Operator.ExchangeCoarseLinks(); #endif } +template +void ReLoadOperator(Coarsened &Operator,std::string file) +{ +#ifdef HAVE_LIME + emptyUserRecord record; + Grid::ScidacReader RD ; + RD.open(file); + assert(Operator._A.size()==Operator.geom.npoint); + for(int p=0;p void SaveBasis(aggregation &Agg,std::string file) { @@ -96,14 +112,6 @@ void LoadBasis(aggregation &Agg, std::string file) #endif } - -template class TestSolver : public LinearFunction { -public: - TestSolver() {}; - void operator() (const Field &in, Field &out){ out = Zero(); } -}; - - RealD InverseApproximation(RealD x){ return 1.0/x; } @@ -123,7 +131,7 @@ public: void OpDirAll (const Field &in, std::vector &out) { assert(0); }; void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } }; -template class ChebyshevSmoother : public LinearFunction +template class ChebyshevSmoother : public LinearFunction { public: using LinearFunction::operator(); @@ -144,12 +152,35 @@ public: } }; +template class CGSmoother : public LinearFunction +{ +public: + using LinearFunction::operator(); + typedef LinearOperatorBase FineOperator; + FineOperator & _SmootherOperator; + int iters; + CGSmoother(int _iters, FineOperator &SmootherOperator) : + _SmootherOperator(SmootherOperator), + iters(_iters) + { + std::cout << GridLogMessage<<" Mirs smoother order "< CG(0.0,iters,false); // non-converge is just fine in a smoother + CG(_SmootherOperator,in,out); + } +}; + + int main (int argc, char ** argv) { Grid_init(&argc,&argv); const int Ls=24; - const int nbasis = 40; + const int nbasis = 62; + // const int nbasis = 56; + // const int nbasis = 44; const int cb = 0 ; RealD mass=0.00078; RealD M5=1.8; @@ -164,10 +195,12 @@ int main (int argc, char ** argv) GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); // Construct a coarsened grid with 4^4 cell + Coordinate Block({4,4,6,4}); Coordinate clatt = GridDefaultLatt(); for(int d=0;d HermFineMatrix; HermFineMatrix FineHermOp(HermOpEO); - + LatticeFermion result(FrbGrid); result=Zero(); LatticeFermion src(FrbGrid); random(RNG5,src); // Run power method on FineHermOp PowerMethod PM; PM(HermOpEO,src); - //////////////////////////////////////////////////////////// ///////////// Coarse basis and Little Dirac Operator /////// @@ -223,29 +255,110 @@ int main (int argc, char ** argv) //////////////////////////////////////////////////////////// LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d); - std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys.nolex.scidac"); - std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys.nolex.scidac"); + std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.rat.scidac.62"); + std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.rat.scidac.62"); + std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.rat.scidac.62"); bool load_agg=true; + bool load_refine=true; bool load_mat=true; if ( load_agg ) { LoadBasis(Aggregates,subspace_file); } else { + + // NBASIS=40 + // Best so far: ord 2000 [0.01,95], 500,500 -- 466 iters + // slurm-398626.out:Grid : Message : 141.295253 s : 500 filt [1] 0.000103622063 + + + //Grid : Message : 33.870465 s : Chebyshev subspace pass-1 : ord 2000 [0.001,95] + //Grid : Message : 33.870485 s : Chebyshev subspace pass-2 : nbasis40 min 1000 step 1000 lo0 + //slurm-1482200.out : filt ~ 0.004 -- not as low mode projecting -- took 626 iters + + // To try: 2000 [0.1,95] ,2000,500,500 -- slurm-1482213.out 586 iterations + + // To try: 2000 [0.01,95] ,2000,500,500 -- 469 (think I bumped 92 to 95) (??) + // To try: 2000 [0.025,95],2000,500,500 + // To try: 2000 [0.005,95],2000,500,500 + + // NBASIS=44 -- HDCG paper was 64 vectors; AMD compiler craps out at 48 + // To try: 2000 [0.01,95] ,2000,500,500 -- 419 lowest slurm-1482355.out + // To try: 2000 [0.025,95] ,2000,500,500 -- 487 + // To try: 2000 [0.005,95] ,2000,500,500 + /* + Smoother [3,92] order 16 +slurm-1482355.out:Grid : Message : 35.239686 s : Chebyshev subspace pass-1 : ord 2000 [0.01,95] +slurm-1482355.out:Grid : Message : 35.239714 s : Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0 +slurm-1482355.out:Grid : Message : 5561.305552 s : HDCG: Pcg converged in 419 iterations and 2616.202598 s + +slurm-1482367.out:Grid : Message : 43.157235 s : Chebyshev subspace pass-1 : ord 2000 [0.025,95] +slurm-1482367.out:Grid : Message : 43.157257 s : Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0 +slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 iterations and 3131.185821 s + */ + /* + Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, + 95.0,0.0075, + 2500, + 500, + 500, + 0.0); + */ + + /* + Aggregates.CreateSubspaceChebyshevPowerLaw(RNG5,HermOpEO,nbasis, + 95.0, + 2000); + */ + + Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO, + 0.0003,1.0e-5,2000); // Lo, tol, maxit + /* Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, 95.0,0.05, - 1000, - 200, - 200, + 2000, + 500, + 500, 0.0); + */ + /* + Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis, + 95.0,0.01, + 2000, + 500, + 500, + 0.0); + */ + // Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); -- running slurm-1484934.out nbasis 56 + + // Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); <== last run SaveBasis(Aggregates,subspace_file); } + + int refine=1; + if(refine){ + if ( load_refine ) { + LoadBasis(Aggregates,refine_file); + } else { + // HDCG used Pcg to refine + Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); + SaveBasis(Aggregates,refine_file); + } + } + + Aggregates.Orthogonalise(); if ( load_mat ) { LoadOperator(LittleDiracOp,ldop_file); } else { LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates); SaveOperator(LittleDiracOp,ldop_file); } - + + // I/O test: + CoarseVector c_src(Coarse5d); random(CRNG,c_src); + CoarseVector c_res(Coarse5d); + CoarseVector c_ref(Coarse5d); + // Try projecting to one hop only + // LittleDiracOp.ShiftMatrix(1.0e-4); LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d); LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n @@ -256,21 +369,28 @@ int main (int argc, char ** argv) ////////////////////////////////////////// // Build a coarse lanczos ////////////////////////////////////////// - // Chebyshev IRLCheby(0.01,44.0,201); // 1 iter - Chebyshev IRLCheby(0.005,44.0,401); // 1 iter + // Chebyshev IRLCheby(0.012,40.0,201); //500 HDCG iters + // int Nk=512; // Didn't save much + // int Nm=640; + // int Nstop=400; + + // Chebyshev IRLCheby(0.005,40.0,201); //319 HDCG iters @ 128//160 nk. + // int Nk=128; + // int Nm=160; + Chebyshev IRLCheby(0.005,40.0,201); //319 HDCG iters @ 128//160 nk. + int Nk=192; + int Nm=256; + int Nstop=Nk; + + // Chebyshev IRLCheby(0.010,45.0,201); // 1 iter FunctionHermOp IRLOpCheby(IRLCheby,CoarseOp); PlainHermOp IRLOp (CoarseOp); - int Nk=160; - int Nm=240; - int Nstop=Nk; - ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1e-5,10); int Nconv; std::vector eval(Nm); std::vector evec(Nm,Coarse5d); - CoarseVector c_src(Coarse5d); c_src=1.0; - CoarseVector c_res(Coarse5d); - CoarseVector c_ref(Coarse5d); PowerMethod cPM; cPM(CoarseOp,c_src); @@ -280,9 +400,9 @@ int main (int argc, char ** argv) ////////////////////////////////////////// // Build a coarse space solver ////////////////////////////////////////// - int maxit=20000; - ConjugateGradient CG(1.0e-8,maxit,false); - ConjugateGradient CGfine(1.0e-8,10000,false); + int maxit=30000; + ConjugateGradient CG(1.0e-10,maxit,false); + ConjugateGradient CGfine(1.0e-8,30000,false); ZeroGuesser CoarseZeroGuesser; // HPDSolver HPDSolve(CoarseOp,CG,CoarseZeroGuesser); @@ -306,8 +426,7 @@ int main (int argc, char ** argv) ////////////////////////////////////////////////////////////////////// // Coarse ADEF1 with deflation space ////////////////////////////////////////////////////////////////////// - ChebyshevSmoother - CoarseSmoother(1.0,37.,8,CoarseOpProj); // just go to sloppy 0.1 convergence + ChebyshevSmoother CoarseSmoother(1.0,37.,8,CoarseOpProj); // just go to sloppy 0.1 convergence // CoarseSmoother(0.1,37.,8,CoarseOpProj); // // CoarseSmoother(0.5,37.,6,CoarseOpProj); // 8 iter 0.36s // CoarseSmoother(0.5,37.,12,CoarseOpProj); // 8 iter, 0.55s @@ -375,41 +494,134 @@ int main (int argc, char ** argv) // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and // use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec // - std::vector los({3.0}); // Nbasis 40 == 36,36 iters + // + // + // + + std::vector los({2.0,2.5}); // Nbasis 40 == 36,36 iters // std::vector ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults) - std::vector ords({7}); // Nbasis 40 == 40 iters (320 mults) + // std::vector ords({7}); // Nbasis 40 == 40 iters (320 mults) + std::vector ords({9}); // Nbasis 40 == 40 iters (320 mults) + /* + Smoother opt @56 nbasis, 0.04 convergence, 192 evs + ord lo + + 16 0.1 no converge -- likely sign indefinite + 32 0.1 no converge -- likely sign indefinite(?) + + 16 0.5 422 + 32 0.5 302 + + 8 1.0 575 + 12 1.0 449 + 16 1.0 375 + 32 1.0 302 + + 12 3.0 476 + 16 3.0 319 + 32 3.0 306 + + Powerlaw setup 62 vecs +slurm-1494943.out:Grid : Message : 4874.186617 s : HDCG: Pcg converged in 171 iterations and 1706.548006 s 1.0 32 +slurm-1494943.out:Grid : Message : 6490.121648 s : HDCG: Pcg converged in 194 iterations and 1616.219654 s 1.0 16 + + Cheby setup: 56vecs + -- CG smoother O(16): 487 + +Power law setup, 56 vecs -- lambda^-5 +slurm-1494383.out:Grid : Message : 4377.173265 s : HDCG: Pcg converged in 204 iterations and 1153.548935 s 1.0 32 + +Power law setup, 56 vecs -- lambda^-3 + +slurm-1494242.out:Grid : Message : 4370.464814 s : HDCG: Pcg converged in 204 iterations and 1143.494776 s 1.0 32 +slurm-1494242.out:Grid : Message : 5432.414820 s : HDCG: Pcg converged in 237 iterations and 1061.455882 s 1.0 16 +slurm-1494242.out:Grid : Message : 6588.727977 s : HDCG: Pcg converged in 205 iterations and 1156.565210 s 0.5 32 + + Power law setup, 56 vecs -- lambda^-4 + -- CG smoother O(16): 290 + -- Cheby smoother O(16): 218 -- getting close to the deflation level I expect 169 from BFM paper @O(7) smoother and 64 nbasis + +Grid : Message : 2790.797194 s : HDCG: Pcg converged in 190 iterations and 1049.563182 s 1.0 32 +Grid : Message : 3766.374396 s : HDCG: Pcg converged in 218 iterations and 975.455668 s 1.0 16 +Grid : Message : 4888.746190 s : HDCG: Pcg converged in 191 iterations and 1122.252055 s 0.5 32 +Grid : Message : 5956.679661 s : HDCG: Pcg converged in 231 iterations and 1067.812850 s 0.5 16 + +Grid : Message : 2767.405829 s : HDCG: Pcg converged in 218 iterations and 967.214067 s -- 16 +Grid : Message : 3816.165905 s : HDCG: Pcg converged in 251 iterations and 1048.636269 s -- 12 +Grid : Message : 5121.206572 s : HDCG: Pcg converged in 318 iterations and 1304.916168 s -- 8 + + +[paboyle@login2.crusher debug]$ grep -v Memory slurm-402426.out | grep converged | grep HDCG -- [1.0,16] cheby +Grid : Message : 5185.521063 s : HDCG: Pcg converged in 377 iterations and 1595.843529 s + +[paboyle@login2.crusher debug]$ grep HDCG slurm-402184.out | grep onver +Grid : Message : 3760.438160 s : HDCG: Pcg converged in 422 iterations and 2129.243141 s +Grid : Message : 5660.588015 s : HDCG: Pcg converged in 308 iterations and 1900.026821 s + + +Grid : Message : 4238.206528 s : HDCG: Pcg converged in 575 iterations and 2657.430676 s +Grid : Message : 6345.880344 s : HDCG: Pcg converged in 449 iterations and 2108.505208 s + +grep onverg slurm-401663.out | grep HDCG +Grid : Message : 3900.817781 s : HDCG: Pcg converged in 476 iterations and 1992.591311 s +Grid : Message : 5647.202699 s : HDCG: Pcg converged in 306 iterations and 1746.838660 s + + +[paboyle@login2.crusher debug]$ grep converged slurm-401775.out | grep HDCG +Grid : Message : 3583.177025 s : HDCG: Pcg converged in 375 iterations and 1800.896037 s +Grid : Message : 5348.342243 s : HDCG: Pcg converged in 302 iterations and 1765.045018 s + +Conclusion: higher order smoother is doing better. Much better. Use a Krylov smoother instead Mirs as in BFM version. + + */ + // for(int l=0;l CGsloppy(5.0e-2,maxit,false); + ConjugateGradient CGsloppy(4.0e-2,maxit,false); HPDSolver HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser); // ChebyshevSmoother Smoother(lo,92,10,FineHermOp); // 36 best case - ChebyshevSmoother Smoother(lo,92,ords[o],FineHermOp); // 311 + ChebyshevSmoother ChebySmooth(lo,95,ords[o],FineHermOp); // 311 + /* + * CG smooth 11 iter: + slurm-403825.out:Grid : Message : 4369.824339 s : HDCG: fPcg converged in 215 iterations 3.0 + slurm-403908.out:Grid : Message : 3955.897470 s : HDCG: fPcg converged in 236 iterations 1.0 + slurm-404273.out:Grid : Message : 3843.792191 s : HDCG: fPcg converged in 210 iterations 2.0 + * CG smooth 9 iter: + */ + // + RealD MirsShift = lo; + ShiftedHermOpLinearOperator ShiftedFineHermOp(HermOpEO,MirsShift); + CGSmoother CGsmooth(ords[o],ShiftedFineHermOp) ; + ////////////////////////////////////////// // Build a HDCG solver ////////////////////////////////////////// TwoLevelADEF2 - HDCG(1.0e-8, 100, + HDCG(1.0e-8, 700, FineHermOp, - Smoother, + // ChebySmooth, + CGsmooth, HPDSolveSloppy, HPDSolve, Aggregates); - TwoLevelADEF2 - HDCGdefl(1.0e-8, 100, + /* + TwoLevelADEF2 + HDCGdefl(1.0e-8, 700, FineHermOp, Smoother, cADEF1, HPDSolve, Aggregates); + */ // result=Zero(); // HDCGdefl(src,result);