From 520b90259dce6bac473f8c68091d894bd4f74b45 Mon Sep 17 00:00:00 2001 From: Thomas Blum Date: Wed, 27 May 2026 15:52:49 -0400 Subject: [PATCH] Add staggered HDCG multigrid test and mac-arm Homebrew build scripts Test_staggered_hdcg.cc implements a two-level ADEF2 multigrid solver for NaiveStaggeredFermion using SchurStaggeredOperator, following the mrhs hermitian multigrid approach of arXiv:2409.03904. Uses a 33-point coarse stencil (NextToNearestStencilGeometry4D) with nbasis=24, block={4,4,4,4}, and Chebyshev subspace generation with hi=5.0 (lambda_max ~4.6). Also adds systems/mac-arm/sourceme-homebrew.sh and config-command-homebrew for building Grid on Apple Silicon with Homebrew-installed dependencies. Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 17 ++ systems/mac-arm/config-command-homebrew | 18 ++ systems/mac-arm/sourceme-homebrew.sh | 15 ++ tests/debug/Test_staggered_hdcg.cc | 267 ++++++++++++++++++++++++ 4 files changed, 317 insertions(+) create mode 100644 systems/mac-arm/config-command-homebrew create mode 100644 systems/mac-arm/sourceme-homebrew.sh create mode 100644 tests/debug/Test_staggered_hdcg.cc diff --git a/CLAUDE.md b/CLAUDE.md index 4b6acf2a..5e990b5f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,6 +30,9 @@ Key configure options: | `--enable-Nc=` | `3` (default), `2`, `4`, `5` | | `--with-gmp=`, `--with-mpfr=`, `--with-fftw=`, `--with-lime=` | paths to libs | | `--enable-hdf5`, `--enable-mkl`, `--enable-lapack` | optional features | +| `--enable-debug=yes` | adds `-g`, removes `-O3` | + +Use `make V=1` for verbose compiler output (shows full flags; required for bug reports). Platform recipes from `README.md`: - **KNL**: `--enable-simd=KNL --enable-comms=mpi3-auto --enable-mkl` @@ -96,3 +99,17 @@ GPU support is injected via macros (`accelerator_for`, `accelerator_for2dNB`). T - The `RealD`/`RealF`/`ComplexD`/`ComplexF` typedefs are used everywhere; avoid raw `double`/`float`. - Logging uses `Grid_log`, `Grid_error` macros (from `Grid/log/`); performance-critical paths use the `GRID_TRACE` / timer macros from `Grid/perfmon/`. - Reductions across MPI ranks go through `GridBase::GlobalSum` / `GlobalMax`; never reduce with bare MPI calls inside library code. + +## Skills + +`skills/` contains seven user-invocable Claude Code skills encoding deep domain knowledge for HPC work in this repo. Invoke with `/skill-name` or ask Claude to use them by name: + +| Skill | When to use | +|-------|-------------| +| `gpu-memory-performance` | Bandwidth/occupancy problems — `acceleratorThreads()` pitfalls, `coalescedRead/Write`, fused vs staged HBM patterns | +| `gpu-runtime-correctness` | Wrong answers, non-deterministic results, premature `q.wait()` returns | +| `communication-overlap` | Designing GPU+MPI overlap pipelines; replacing broken accelerator-aware MPI paths with host-staged 7-phase pipeline | +| `mpi-heterogeneous` | Collective hangs, buffer aliasing in `MPI_Sendrecv`, heterogeneous topology bugs | +| `hang-diagnosis` | Distinguishing ioctl hangs, infinite poll loops, collective deadlocks, and silent wrong-answer races | +| `correctness-verification` | Reproducibility checksums, double-wait testing, bisecting non-deterministic failures | +| `compiler-validation` | Confirming compiler/optimisation flags are safe before production runs | diff --git a/systems/mac-arm/config-command-homebrew b/systems/mac-arm/config-command-homebrew new file mode 100644 index 00000000..e3693601 --- /dev/null +++ b/systems/mac-arm/config-command-homebrew @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Configure Grid from a build directory two levels below the source root, +# e.g.: mkdir -p systems/mac-arm/build && cd systems/mac-arm/build && bash ../config-command-homebrew +# +# Prerequisites: source sourceme-homebrew.sh first. + +CXX=mpicxx ../../configure \ + --enable-simd=GEN \ + --enable-comms=mpi-auto \ + --enable-Sp=yes \ + --enable-unified=yes \ + --prefix="${HOME}/Grid-install" \ + --with-lime="${CLIME}" \ + --with-openssl="${OPENSSL}" \ + --with-gmp="${GMP}" \ + --with-mpfr="${MPFR}" \ + --with-fftw="${FFTW}" \ + --disable-debug diff --git a/systems/mac-arm/sourceme-homebrew.sh b/systems/mac-arm/sourceme-homebrew.sh new file mode 100644 index 00000000..f8a2b9ab --- /dev/null +++ b/systems/mac-arm/sourceme-homebrew.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Environment for building Grid on Apple Silicon Mac with Homebrew dependencies. +# Usage: source systems/mac-arm/sourceme-homebrew.sh + +HOMEBREW=/opt/homebrew + +export GMP=${HOMEBREW}/opt/gmp +export MPFR=${HOMEBREW}/opt/mpfr +export FFTW=${HOMEBREW}/opt/fftw +export OPENSSL=${HOMEBREW}/opt/openssl@3 +export CLIME=/usr/local + +export PATH="${HOMEBREW}/opt/open-mpi/bin:${HOMEBREW}/bin:${PATH}" +export LDFLAGS="-L${OPENSSL}/lib" +export CPPFLAGS="-I${OPENSSL}/include" diff --git a/tests/debug/Test_staggered_hdcg.cc b/tests/debug/Test_staggered_hdcg.cc new file mode 100644 index 00000000..ed78333d --- /dev/null +++ b/tests/debug/Test_staggered_hdcg.cc @@ -0,0 +1,267 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: tests/debug/Test_staggered_hdcg.cc + + Authors: Thomas Blum, Peter Boyle + + HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver + for naive staggered fermions, based on arXiv:2409.03904. + + Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to: + - NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term) + - 4D SchurStaggeredOperator: Mpc = m^2 - D_oe * D_eo (hermitian, positive-definite) + - vColourVector fine field type (staggered has colour but no spin) + - NextToNearestStencilGeometry4D: 33-point coarse stencil + + Stencil count: D_oe*D_eo has 2-hop fine range. With blocking B >= 2 the coarse + shifts have L1-distance <= 2, giving 33 stencil points in 4D: + 1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu). + NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices. + To extend to ImprovedStaggeredFermion later, use B >= 6. + + Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF). + + Usage (after build): + ./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1 + +*************************************************************************************/ +#include +#include +#include + +using namespace Grid; + +// Non-converging CG used as a smoother (fixed number of iterations) +template +class CGSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + FineOperator &_Op; + int iters; + CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {} + void operator()(const Field &in, Field &out) + { + ConjugateGradient CG(0.0, iters, false); + out = Zero(); + CG(_Op, in, out); + } +}; + +int main(int argc, char **argv) +{ + Grid_init(&argc, &argv); + + //-------------------------------------------------------------------- + // Parameters — tune for production + //-------------------------------------------------------------------- + const int nbasis = 24; // near-null space dimension + const int cb = 0; // even checkerboard + + RealD mass = 0.05; + + // NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term). + // c1 = coefficient of the hopping term (1.0 = standard normalisation). + // u0 = tadpole factor (1.0 = no tadpole improvement). + RealD c1 = 1.0; + RealD u0 = 1.0; + + //-------------------------------------------------------------------- + // Grids + // Fine: UGrid (4D full), UrbGrid (4D red-black) + // Coarse: Coarse4d with dimensions = GridDefaultLatt() / Block + // + // Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4} + // NaiveStaggeredFermion works with any Block >= {2,2,2,2} + //-------------------------------------------------------------------- + GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid( + GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + + Coordinate Block({4, 4, 4, 4}); + Coordinate clatt = GridDefaultLatt(); + for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d]; + + GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid( + clatt, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi()); + + //-------------------------------------------------------------------- + // RNG + gauge field + //-------------------------------------------------------------------- + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1,2,3,4}); + GridParallelRNG RNGrb(UrbGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); + + LatticeGaugeField Umu(UGrid); + // Hot gauge start for testing; replace with NerscIO::readConfiguration for production + SU::HotConfiguration(RNG4, Umu); + + //-------------------------------------------------------------------- + // Naive staggered fermion + Schur hermitian operator + // + // NaiveStaggeredFermion: nearest-neighbour hops only, no Naik term. + // SchurStaggeredOperator::Op = HermOp = Mpc = m^2 - D_oe * D_eo + // Positive-definite hermitian; CG and HDCG apply directly. + // No HermOpAdaptor wrapper needed (Op == HermOp for SchurStaggeredOperator). + //-------------------------------------------------------------------- + NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0); + SchurStaggeredOperator HermOp(Ds); + + //-------------------------------------------------------------------- + // Subspace: Chebyshev-filtered near-null vectors + // + // hi = spectral upper bound of HermOp = m^2 + lambda_max(D_oe*D_eo). + // For naive staggered (c1=1), lambda_max(SchurStaggeredOperator) ~ 4.6. + // Use 5.0 to give a small margin; refine with PowerMethod for each config. + // PowerMethod for production. Progressive Chebyshev filtering + // (ChebyshevNew) uses two-pass polynomial filter. + //-------------------------------------------------------------------- + typedef Aggregation Subspace; + Subspace Aggregates(Coarse4d, UrbGrid, cb); + + RealD hi = 5.0; + Aggregates.CreateSubspaceChebyshevNew(RNGrb, HermOp, hi); + Aggregates.Orthogonalise(); + + //-------------------------------------------------------------------- + // Coarse geometry: NextToNearestStencilGeometry4D + // hops=2 -> 33 stencil points in 4D + //-------------------------------------------------------------------- + NextToNearestStencilGeometry4D geom(Coarse4d); + + std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl; + + //-------------------------------------------------------------------- + // Single-RHS coarse operator (used for correctness check below) + //-------------------------------------------------------------------- + typedef GeneralCoarsenedMatrix LittleDiracOp; + typedef LittleDiracOp::CoarseVector CoarseVector; + + LittleDiracOp LDO(geom, UrbGrid, Coarse4d); + LDO.CoarsenOperator(HermOp, Aggregates); + + //-------------------------------------------------------------------- + // Correctness check: P M_fine P^T c ≈ M_coarse c + // + // Promote a random coarse vector into the fine subspace, apply the + // fine operator, project back, and compare with the coarse operator + // applied directly. Error should be at the level of subspace + // approximation quality (smaller = better basis vectors). + //-------------------------------------------------------------------- + { + GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12}); + CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d); + random(RNGc, c_src); + + LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid); + Aggregates.PromoteFromSubspace(c_src, f_v); + HermOp.Op(f_v, f_Mv); + Aggregates.ProjectToSubspace(c_proj, f_Mv); + + LDO.M(c_src, c_ldop); + + c_proj -= c_ldop; + RealD err = norm2(c_proj) / norm2(c_ldop); + std::cout << GridLogMessage + << "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl; + } + + //-------------------------------------------------------------------- + // Multi-RHS coarse grid + // + // The extra leading dimension holds nrhs right-hand sides packed into + // SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48. + //-------------------------------------------------------------------- + const int nrhs = vComplex::Nsimd() * 2; + + Coordinate mpi = GridDefaultMpi(); + Coordinate rhMpi ({1, mpi[0], mpi[1], mpi[2], mpi[3]}); + Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]}); + Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1}); + + GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi); + + typedef MultiGeneralCoarsenedMatrix MultiCoarseOp; + MultiCoarseOp mrhs(geom, CoarseMrhs); + mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d); + + //-------------------------------------------------------------------- + // Coarse-grid Lanczos for deflation + //-------------------------------------------------------------------- + typedef HermitianLinearOperator MrhsHermOp; + MrhsHermOp MrhsCoarseOp(mrhs); + + // Estimate spectral bounds for Lanczos Chebyshev filter + CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0); + PowerMethod cPM; + RealD lambda_max = cPM(MrhsCoarseOp, pm_src); + RealD lambda_lo = mass * mass * 0.5; + + Chebyshev IRLCheby(lambda_lo, lambda_max * 1.1, 71); + + int Nk = 48; + int Nm = 96; + int Nstop = Nk; + + GridParallelRNG CRNG(CoarseMrhs); CRNG.SeedFixedIntegers({13,14,15,16}); + + ImplicitlyRestartedBlockLanczosCoarse + IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby, + Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm, Coarse4d); + std::vector c_srcs(nrhs, CoarseMrhs); + for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]); + IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl); + + //-------------------------------------------------------------------- + // HDCG solver assembly + //-------------------------------------------------------------------- + MultiRHSDeflation MrhsGuesser; + MrhsGuesser.ImportEigenBasis(evec, eval); + + // MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces + MultiRHSBlockProject MrhsProjector; + MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d); + MrhsProjector.ImportBasis(Aggregates.subspace); + + ConjugateGradient CoarseCG(5.0e-2, 5000, false); + DoNothingGuesser DoNothing; + HPDSolver HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing); + + // Shifted smoother: CG on (HermOp + shift) damps low modes efficiently. + // Shift ~ m^2 sits just above the spectral gap. + RealD smootherShift = mass * mass; + ShiftedHermOpLinearOperator ShiftedOp(HermOp, smootherShift); + CGSmoother smoother(8, ShiftedOp); + + TwoLevelADEF2mrhs + HDCG(1.0e-8, 500, + HermOp, + smoother, + HPDSolve, // M1 (coarse correction) + HPDSolve, // Vstart (initial guess projection) + MrhsProjector, + MrhsGuesser, + CoarseMrhs); + + //-------------------------------------------------------------------- + // Solve: nrhs right-hand sides simultaneously + //-------------------------------------------------------------------- + std::vector src(nrhs, UrbGrid); + std::vector sol(nrhs, UrbGrid); + + GridParallelRNG RNGrb2(UrbGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); + for (int r = 0; r < nrhs; r++) { + random(RNGrb2, src[r]); + sol[r] = Zero(); + } + + HDCG(src, sol); + + Grid_finalize(); + return 0; +}