From 42cd9eda712b7e83d9f5bda5be6a942e7cfd8169 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 29 May 2026 13:36:57 -0400 Subject: [PATCH] Some improvements that should have been there if in synch with develop, and also some staggered hdcg type work --- Grid/lattice/Lattice_slicesum_core.h | 2 +- benchmarks/Benchmark_staggered.cc | 25 +- benchmarks/Benchmark_usqcd.cc | 176 ++++++++- .../Frontier/benchmarks/Benchmark_usqcd.csv | 141 ++++--- systems/Frontier/config-command | 10 +- systems/Frontier/sourceme.sh | 22 +- tests/debug/Test_general_coarse.cc | 4 +- tests/debug/Test_staggered_hdcg_ildg.cc | 373 ++++++++++++++++++ 8 files changed, 660 insertions(+), 93 deletions(-) create mode 100644 tests/debug/Test_staggered_hdcg_ildg.cc diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 5d96c4fb..f3bd51ad 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -59,7 +59,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data, #if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13) #define GRID_CUB_SUM_OP ::cuda::std::plus<>{} #else - #define GRID_CUB_SUM_OP ::cub::Sum() + #define GRID_CUB_SUM_OP ::gpucub::Sum() #endif gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream); diff --git a/benchmarks/Benchmark_staggered.cc b/benchmarks/Benchmark_staggered.cc index 65e04f27..a419d600 100644 --- a/benchmarks/Benchmark_staggered.cc +++ b/benchmarks/Benchmark_staggered.cc @@ -33,7 +33,6 @@ Author: paboyle using namespace std; using namespace Grid; - ; int main (int argc, char ** argv) { @@ -97,20 +96,38 @@ int main (int argc, char ** argv) RealD c2=-1.0/24.0; RealD u0=1.0; ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params); + NaiveStaggeredFermionD Dn(Umu,Grid,RBGrid,mass,c1,u0,params); std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + + typedef NaiveStaggeredFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); + + typename Action::ImplParams params; + Action Ds(Umu,*FGrid,*FrbGrid,mass,c1,u0,params); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + const int num_cases = 2; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + + uint64_t no = 50; + uint64_t ni = 100; + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(no); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops clover; std::vector dwf4; std::vector staggered; + std::vector naive_staggered; int Ls=1; if (do_dslash){ @@ -914,13 +1070,21 @@ int main (int argc, char ** argv) staggered.push_back(result); } + std::cout< using namespace std; using namespace Grid; -gridblasHandle_t GridBLAS::gridblasHandle; -int GridBLAS::gridblasInit; +//gridblasHandle_t GridBLAS::gridblasHandle; +//int GridBLAS::gridblasInit; /////////////////////// // Tells little dirac op to use MdagM as the .Op() diff --git a/tests/debug/Test_staggered_hdcg_ildg.cc b/tests/debug/Test_staggered_hdcg_ildg.cc new file mode 100644 index 00000000..852443fa --- /dev/null +++ b/tests/debug/Test_staggered_hdcg_ildg.cc @@ -0,0 +1,373 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: tests/debug/Test_staggered_hdcg.cc + + Authors: Thomas Blum, Peter Boyle + + HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver + for naive staggered fermions, based on arXiv:2409.03904. + + Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to: + - NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term) + - 4D SchurStaggeredOperator: Mpc = m^2 - D_oe * D_eo (hermitian, positive-definite) + - vColourVector fine field type (staggered has colour but no spin) + - NextToNearestStencilGeometry4D: 33-point coarse stencil + + Stencil count: D_oe*D_eo has 2-hop fine range. With blocking B >= 2 the coarse + shifts have L1-distance <= 2, giving 33 stencil points in 4D: + 1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu). + NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices. + To extend to ImprovedStaggeredFermion later, use B >= 6. + + Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF). + + Usage (after build): + ./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1 + +*************************************************************************************/ +#include +#include +#include +#include + +using namespace Grid; + +// Non-converging CG used as a smoother (fixed number of iterations) +template +class CGSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + FineOperator &_Op; + int iters; + CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {} + void operator()(const Field &in, Field &out) + { + ConjugateGradient CG(0.0, iters, false); + out = Zero(); + CG(_Op, in, out); + } +}; + +int main(int argc, char **argv) +{ + fprintf(stderr, "TRACE: entering main\n"); fflush(stderr); + Grid_init(&argc, &argv); + fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr); + + //-------------------------------------------------------------------- + // Parameters — tune for production + //-------------------------------------------------------------------- + const int nbasis = 24; // near-null space dimension + const int cb = 0; // even checkerboard + + RealD mass = 0.00184; + + // NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term). + // c1 = coefficient of the hopping term (1.0 = standard normalisation). + // u0 = tadpole factor (1.0 = no tadpole improvement). + RealD c1 = 1.0; + RealD u0 = 1.0; + + //-------------------------------------------------------------------- + // Grids + // Fine: UGrid (4D full), UrbGrid (4D red-black) + // Coarse: Coarse4d with dimensions = GridDefaultLatt() / Block + // + // Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4} + // NaiveStaggeredFermion works with any Block >= {2,2,2,2} + //-------------------------------------------------------------------- + fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr); + GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid( + GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); + fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr); + GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + + Coordinate Block({4, 4, 4, 4}); + Coordinate clatt = GridDefaultLatt(); + for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d]; + Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd()); + Coordinate cmpi = GridDefaultMpi(); + fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n", + clatt[0],clatt[1],clatt[2],clatt[3], + csimd[0],csimd[1],csimd[2],csimd[3], + cmpi[0],cmpi[1],cmpi[2],cmpi[3], + (int)vComplex::Nsimd()); fflush(stderr); + + GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi); + fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr); + + //-------------------------------------------------------------------- + // RNG + gauge field + //-------------------------------------------------------------------- + fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1,2,3,4}); + fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr); + GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid + fprintf(stderr, "TRACE: Umu\n"); fflush(stderr); + LatticeGaugeField Umu(UGrid); + int HotStart = 0; + if ( HotStart ) { + fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr); + SU::HotConfiguration(RNG4, Umu); + } else { + FieldMetaData header; + std::string file("./configuration.ildg"); + IldgReader IR; + IR.open(file); + IR.readConfiguration(Umu,header); + IR.close(); + } + + fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr); + NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0); + fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr); + SchurStaggeredOperator HermOp(Ds); + fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr); + + //-------------------------------------------------------------------- + // Subspace: inverse-iteration near-null vectors + // + // CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors, + // converging naturally to the low modes of HermOp without needing spectral + // bound tuning. Switch to CreateSubspaceChebyshevNew once the spectrum is + // well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator). + //-------------------------------------------------------------------- + typedef Aggregation Subspace; + Subspace Aggregates(Coarse4d, UrbGrid, cb); + + Aggregates.CreateSubspace(RNGrb, HermOp); + Aggregates.Orthogonalise(); + + //-------------------------------------------------------------------- + // Coarse geometry: NextToNearestStencilGeometry4D + // hops=2 -> 33 stencil points in 4D + //-------------------------------------------------------------------- + NextToNearestStencilGeometry4D geom(Coarse4d); + + std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl; + + //-------------------------------------------------------------------- + // Single-RHS coarse operator (used for correctness check below) + //-------------------------------------------------------------------- + typedef GeneralCoarsenedMatrix LittleDiracOp; + typedef LittleDiracOp::CoarseVector CoarseVector; + + LittleDiracOp LDO(geom, UrbGrid, Coarse4d); + LDO.CoarsenOperator(HermOp, Aggregates); + + //-------------------------------------------------------------------- + // Correctness check: P M_fine P^T c ≈ M_coarse c + // + // Promote a random coarse vector into the fine subspace, apply the + // fine operator, project back, and compare with the coarse operator + // applied directly. Error should be at the level of subspace + // approximation quality (smaller = better basis vectors). + //-------------------------------------------------------------------- + { + GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12}); + CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d); + random(RNGc, c_src); + + LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid); + Aggregates.PromoteFromSubspace(c_src, f_v); + HermOp.Op(f_v, f_Mv); + Aggregates.ProjectToSubspace(c_proj, f_Mv); + + LDO.M(c_src, c_ldop); + + c_proj -= c_ldop; + RealD err = norm2(c_proj) / norm2(c_ldop); + std::cout << GridLogMessage + << "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl; + } + + //-------------------------------------------------------------------- + // Multi-RHS coarse grid + // + // The extra leading dimension holds nrhs right-hand sides packed into + // SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48. + //-------------------------------------------------------------------- + const int nrhs = vComplex::Nsimd() * 2; + + Coordinate mpi = GridDefaultMpi(); + Coordinate rhMpi ({1, mpi[0], mpi[1], mpi[2], mpi[3]}); + Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]}); + Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1}); + + GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi); + + typedef MultiGeneralCoarsenedMatrix MultiCoarseOp; + MultiCoarseOp mrhs(geom, CoarseMrhs); + mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d); + + //-------------------------------------------------------------------- + // Coarse-grid Lanczos for deflation + //-------------------------------------------------------------------- + typedef HermitianLinearOperator MrhsHermOp; + MrhsHermOp MrhsCoarseOp(mrhs); + + // Estimate spectral bounds for Lanczos Chebyshev filter + CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0); + PowerMethod cPM; + RealD lambda_max = cPM(MrhsCoarseOp, pm_src); + // Chebyshev filter window [lo, hi]: + // lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th + // coarse eigenvalues so that only the target modes receive cosh amplification. + // + // From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config): + // Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746] ~= mass^2 + // Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275) + // Group 2 (second group): lambda in [0.165, 0.179] + // + // lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275, + // factor 8x below lambda_24=0.165). + // hi = lambda_max_coarse * 1.1 ~= 2.121 + // y(lambda_0=0.002647) ~ -1.016 -> T_70 ~ 1.7e5 (cosh(70*0.182)) + // y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5 + // Relative spread across near-null cluster: ~4.3% + // y(lambda_24=0.165) ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1 + // + // order=71 (degree 70) is needed to give ~4% relative spread across the + // near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried) + // gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge. + // Absolute amplification ~1e5; what matters for IRL convergence is the + // relative spread, not the absolute value. + // lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts). + // lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1. + RealD lambda_lo = 0.02; + + std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo + << " hi=" << lambda_max*1.1 << " order=71" << std::endl; + + Chebyshev IRLCheby(lambda_lo, lambda_max * 1.1, 71); + + // 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28 + // in the first Lanczos restart. The remaining modes (~0.165) are a + // second spectral group that needs more Krylov vectors; handle them + // separately once the basic HDCG solve is validated. + int Nk = 24; + int Nm = 48; + int Nstop = Nk; + + GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16}); + + ImplicitlyRestartedBlockLanczosCoarse + IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby, + Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm, Coarse4d); // evec on f_grid (single-RHS coarse) + std::vector c_srcs(nrhs, Coarse4d); // src on same grid as evec + for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]); + IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl); + + //-------------------------------------------------------------------- + // HDCG solver assembly + //-------------------------------------------------------------------- + MultiRHSDeflation MrhsGuesser; + MrhsGuesser.ImportEigenBasis(evec, eval); + + // MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces + MultiRHSBlockProject MrhsProjector; + MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d); + MrhsProjector.ImportBasis(Aggregates.subspace); + + ConjugateGradient CoarseCG(5.0e-2, 5000, false); + DoNothingGuesser DoNothing; + HPDSolver HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing); + + // Spectral radius of the fine operator, needed for the smoother shift. + // Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod. + LatticeStaggeredFermionD fine_pm_src(UrbGrid); + random(RNGrb, fine_pm_src); + PowerMethod finePM; + RealD fine_lambda_max = finePM(HermOp, fine_pm_src); + + // Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 100. + // + // The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the + // interval [shift, lambda_max + shift] ~ [0.046, 4.65], so the polynomial + // focuses entirely on the HIGH-frequency part of the spectrum and leaves + // near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there). + // + // This is the right target because the coarse-grid correction always introduces + // high-frequency spectral leakage: the blocked coarse-grid degrees of freedom + // are piecewise constant across coarse cells and therefore have sharp edges at + // cell boundaries (like lego-block edges). Smoothness is measured by the + // covariant Dirac derivative, so promoting the coarse solution back to the + // fine grid inevitably excites high-frequency components — just as a step + // function always carries high-frequency Fourier content. The smoother must + // repair exactly these high modes. + // + // The smoother and the coarse-grid correction are applied alternately: together + // they both lift the low eigenvalues and pull down the upper eigenvalues of the + // composite preconditioned operator, reducing the condition number seen by the + // outer HDCG iterations. + // + // DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered + // the 8 roots over [0.005, 4.6] and diluted their effect on the high modes. + RealD smootherShift = fine_lambda_max / 200.0; + std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = " + << fine_lambda_max << "/200 = " << smootherShift << std::endl; + ShiftedHermOpLinearOperator ShiftedOp(HermOp, smootherShift); + CGSmoother smoother(8, ShiftedOp); + + TwoLevelADEF2mrhs + HDCG(1.0e-8, 500, + HermOp, + smoother, + HPDSolve, // M1 (coarse correction) + HPDSolve, // Vstart (initial guess projection) + MrhsProjector, + MrhsGuesser, + CoarseMrhs); + + //-------------------------------------------------------------------- + // Solve: nrhs right-hand sides simultaneously + //-------------------------------------------------------------------- + std::vector src(nrhs, UrbGrid); + std::vector sol(nrhs, UrbGrid); + + GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid + for (int r = 0; r < nrhs; r++) { + random(RNGrb2, src[r]); + sol[r] = Zero(); + } + + //-------------------------------------------------------------------- + // Baseline: standard single-RHS CG on HermOp (no preconditioning) + // Run before HDCG to establish the unpreconditioned iteration count + // and wall-clock time for direct comparison. + //-------------------------------------------------------------------- + { + ConjugateGradient CG(1.0e-8, 100000, false); + std::vector cg_sol(nrhs, UrbGrid); + for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero(); + + RealD t0 = usecond(); + int total_iters = 0; + for (int r = 0; r < nrhs; r++) { + std::cout << GridLogMessage << "====== CG baseline RHS " << r + << " ======" << std::endl; + CG(HermOp, src[r], cg_sol[r]); + total_iters += CG.IterationsToComplete; + } + RealD t1 = usecond(); + std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, " + << total_iters << " total iterations, " + << (t1 - t0) / 1.0e6 << " s total, " + << (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl; + } + + //-------------------------------------------------------------------- + // HDCG solve + //-------------------------------------------------------------------- + HDCG(src, sol); + + Grid_finalize(); + return 0; +}