Add staggered HDCG multigrid test and mac-arm Homebrew build scripts

Test_staggered_hdcg.cc implements a two-level ADEF2 multigrid solver for NaiveStaggeredFermion using SchurStaggeredOperator, following the mrhs hermitian multigrid approach of arXiv:2409.03904. Uses a 33-point coarse stencil (NextToNearestStencilGeometry4D) with nbasis=24, block={4,4,4,4}, and Chebyshev subspace generation with hi=5.0 (lambda_max ~4.6). Also adds systems/mac-arm/sourceme-homebrew.sh and config-command-homebrew for building Grid on Apple Silicon with Homebrew-installed dependencies. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-27 05:53:27 +01:00 · 2026-05-27 15:52:49 -04:00
parent b58a1508fa
commit 520b90259d
4 changed files with 317 additions and 0 deletions
@@ -30,6 +30,9 @@ Key configure options:
 | `--enable-Nc=` | `3` (default), `2`, `4`, `5` |
 | `--with-gmp=`, `--with-mpfr=`, `--with-fftw=`, `--with-lime=` | paths to libs |
 | `--enable-hdf5`, `--enable-mkl`, `--enable-lapack` | optional features |
 | `--enable-debug=yes` | adds `-g`, removes `-O3` |
 Use `make V=1` for verbose compiler output (shows full flags; required for bug reports).
 Platform recipes from `README.md`:
 - **KNL**: `--enable-simd=KNL --enable-comms=mpi3-auto --enable-mkl`
@@ -96,3 +99,17 @@ GPU support is injected via macros (`accelerator_for`, `accelerator_for2dNB`). T
 - The `RealD`/`RealF`/`ComplexD`/`ComplexF` typedefs are used everywhere; avoid raw `double`/`float`.
 - Logging uses `Grid_log`, `Grid_error` macros (from `Grid/log/`); performance-critical paths use the `GRID_TRACE` / timer macros from `Grid/perfmon/`.
 - Reductions across MPI ranks go through `GridBase::GlobalSum` / `GlobalMax`; never reduce with bare MPI calls inside library code.
 ## Skills
 `skills/` contains seven user-invocable Claude Code skills encoding deep domain knowledge for HPC work in this repo. Invoke with `/skill-name` or ask Claude to use them by name:
 | Skill | When to use |
 |-------|-------------|
 | `gpu-memory-performance` | Bandwidth/occupancy problems — `acceleratorThreads()` pitfalls, `coalescedRead/Write`, fused vs staged HBM patterns |
 | `gpu-runtime-correctness` | Wrong answers, non-deterministic results, premature `q.wait()` returns |
 | `communication-overlap` | Designing GPU+MPI overlap pipelines; replacing broken accelerator-aware MPI paths with host-staged 7-phase pipeline |
 | `mpi-heterogeneous` | Collective hangs, buffer aliasing in `MPI_Sendrecv`, heterogeneous topology bugs |
 | `hang-diagnosis` | Distinguishing ioctl hangs, infinite poll loops, collective deadlocks, and silent wrong-answer races |
 | `correctness-verification` | Reproducibility checksums, double-wait testing, bisecting non-deterministic failures |
 | `compiler-validation` | Confirming compiler/optimisation flags are safe before production runs |
@@ -0,0 +1,18 @@
 #!/usr/bin/env bash
 # Configure Grid from a build directory two levels below the source root,
 # e.g.: mkdir -p systems/mac-arm/build && cd systems/mac-arm/build && bash ../config-command-homebrew
 #
 # Prerequisites: source sourceme-homebrew.sh first.
 CXX=mpicxx ../../configure \
    --enable-simd=GEN \
    --enable-comms=mpi-auto \
    --enable-Sp=yes \
    --enable-unified=yes \
    --prefix="${HOME}/Grid-install" \
    --with-lime="${CLIME}" \
    --with-openssl="${OPENSSL}" \
    --with-gmp="${GMP}" \
    --with-mpfr="${MPFR}" \
    --with-fftw="${FFTW}" \
    --disable-debug
@@ -0,0 +1,15 @@
 #!/usr/bin/env bash
 # Environment for building Grid on Apple Silicon Mac with Homebrew dependencies.
 # Usage: source systems/mac-arm/sourceme-homebrew.sh
 HOMEBREW=/opt/homebrew
 export GMP=${HOMEBREW}/opt/gmp
 export MPFR=${HOMEBREW}/opt/mpfr
 export FFTW=${HOMEBREW}/opt/fftw
 export OPENSSL=${HOMEBREW}/opt/openssl@3
 export CLIME=/usr/local
 export PATH="${HOMEBREW}/opt/open-mpi/bin:${HOMEBREW}/bin:${PATH}"
 export LDFLAGS="-L${OPENSSL}/lib"
 export CPPFLAGS="-I${OPENSSL}/include"
@@ -0,0 +1,267 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: tests/debug/Test_staggered_hdcg.cc
    Authors: Thomas Blum, Peter Boyle
    HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
    for naive staggered fermions, based on arXiv:2409.03904.
    Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
      - NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
      - 4D SchurStaggeredOperator:  Mpc = m^2 - D_oe * D_eo  (hermitian, positive-definite)
      - vColourVector fine field type (staggered has colour but no spin)
      - NextToNearestStencilGeometry4D: 33-point coarse stencil
    Stencil count: D_oe*D_eo has 2-hop fine range.  With blocking B >= 2 the coarse
    shifts have L1-distance <= 2, giving 33 stencil points in 4D:
      1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
    NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
    To extend to ImprovedStaggeredFermion later, use B >= 6.
    Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
    Usage (after build):
      ./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
 *************************************************************************************/
 #include <Grid/Grid.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
 using namespace Grid;
 // Non-converging CG used as a smoother (fixed number of iterations)
 template<class Field>
 class CGSmoother : public LinearFunction<Field>
 {
 public:
  typedef LinearOperatorBase<Field> FineOperator;
  FineOperator &_Op;
  int iters;
  CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
  void operator()(const Field &in, Field &out)
  {
    ConjugateGradient<Field> CG(0.0, iters, false);
    out = Zero();
    CG(_Op, in, out);
  }
 };
 int main(int argc, char **argv)
 {
  Grid_init(&argc, &argv);
  //--------------------------------------------------------------------
  // Parameters — tune for production
  //--------------------------------------------------------------------
  const int nbasis = 24;   // near-null space dimension
  const int cb     = 0;    // even checkerboard
  RealD mass = 0.05;
  // NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
  // c1 = coefficient of the hopping term (1.0 = standard normalisation).
  // u0 = tadpole factor (1.0 = no tadpole improvement).
  RealD c1 = 1.0;
  RealD u0 = 1.0;
  //--------------------------------------------------------------------
  // Grids
  // Fine:   UGrid (4D full), UrbGrid (4D red-black)
  // Coarse: Coarse4d  with dimensions = GridDefaultLatt() / Block
  //
  // Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
  // NaiveStaggeredFermion works with any Block >= {2,2,2,2}
  //--------------------------------------------------------------------
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  Coordinate Block({4, 4, 4, 4});
  Coordinate clatt = GridDefaultLatt();
  for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(
      clatt, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  //--------------------------------------------------------------------
  // RNG + gauge field
  //--------------------------------------------------------------------
  GridParallelRNG RNG4(UGrid);    RNG4.SeedFixedIntegers({1,2,3,4});
  GridParallelRNG RNGrb(UrbGrid); RNGrb.SeedFixedIntegers({5,6,7,8});
  LatticeGaugeField Umu(UGrid);
  // Hot gauge start for testing; replace with NerscIO::readConfiguration for production
  SU<Nc>::HotConfiguration(RNG4, Umu);
  //--------------------------------------------------------------------
  // Naive staggered fermion + Schur hermitian operator
  //
  // NaiveStaggeredFermion: nearest-neighbour hops only, no Naik term.
  // SchurStaggeredOperator::Op = HermOp = Mpc = m^2 - D_oe * D_eo
  // Positive-definite hermitian; CG and HDCG apply directly.
  // No HermOpAdaptor wrapper needed (Op == HermOp for SchurStaggeredOperator).
  //--------------------------------------------------------------------
  NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
  SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
  //--------------------------------------------------------------------
  // Subspace: Chebyshev-filtered near-null vectors
  //
  // hi = spectral upper bound of HermOp = m^2 + lambda_max(D_oe*D_eo).
  // For naive staggered (c1=1), lambda_max(SchurStaggeredOperator) ~ 4.6.
  // Use 5.0 to give a small margin; refine with PowerMethod for each config.
  // PowerMethod for production.  Progressive Chebyshev filtering
  // (ChebyshevNew) uses two-pass polynomial filter.
  //--------------------------------------------------------------------
  typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
  Subspace Aggregates(Coarse4d, UrbGrid, cb);
  RealD hi = 5.0;
  Aggregates.CreateSubspaceChebyshevNew(RNGrb, HermOp, hi);
  Aggregates.Orthogonalise();
  //--------------------------------------------------------------------
  // Coarse geometry: NextToNearestStencilGeometry4D
  //   hops=2  ->  33 stencil points in 4D
  //--------------------------------------------------------------------
  NextToNearestStencilGeometry4D geom(Coarse4d);
  std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
  //--------------------------------------------------------------------
  // Single-RHS coarse operator (used for correctness check below)
  //--------------------------------------------------------------------
  typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
  typedef LittleDiracOp::CoarseVector CoarseVector;
  LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
  LDO.CoarsenOperator(HermOp, Aggregates);
  //--------------------------------------------------------------------
  // Correctness check: P M_fine P^T c  ≈  M_coarse c
  //
  // Promote a random coarse vector into the fine subspace, apply the
  // fine operator, project back, and compare with the coarse operator
  // applied directly.  Error should be at the level of subspace
  // approximation quality (smaller = better basis vectors).
  //--------------------------------------------------------------------
  {
    GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
    CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
    random(RNGc, c_src);
    LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
    Aggregates.PromoteFromSubspace(c_src, f_v);
    HermOp.Op(f_v, f_Mv);
    Aggregates.ProjectToSubspace(c_proj, f_Mv);
    LDO.M(c_src, c_ldop);
    c_proj -= c_ldop;
    RealD err = norm2(c_proj) / norm2(c_ldop);
    std::cout << GridLogMessage
              << "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
  }
  //--------------------------------------------------------------------
  // Multi-RHS coarse grid
  //
  // The extra leading dimension holds nrhs right-hand sides packed into
  // SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
  //--------------------------------------------------------------------
  const int nrhs = vComplex::Nsimd() * 2;
  Coordinate mpi   = GridDefaultMpi();
  Coordinate rhMpi ({1,    mpi[0],  mpi[1],  mpi[2],  mpi[3]});
  Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
  typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
  MultiCoarseOp mrhs(geom, CoarseMrhs);
  mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
  //--------------------------------------------------------------------
  // Coarse-grid Lanczos for deflation
  //--------------------------------------------------------------------
  typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
  MrhsHermOp MrhsCoarseOp(mrhs);
  // Estimate spectral bounds for Lanczos Chebyshev filter
  CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
  PowerMethod<CoarseVector> cPM;
  RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
  RealD lambda_lo  = mass * mass * 0.5;
  Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
  int Nk    = 48;
  int Nm    = 96;
  int Nstop = Nk;
  GridParallelRNG CRNG(CoarseMrhs); CRNG.SeedFixedIntegers({13,14,15,16});
  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
    IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
        Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
  int Nconv;
  std::vector<RealD>        eval(Nm);
  std::vector<CoarseVector> evec(Nm, Coarse4d);
  std::vector<CoarseVector> c_srcs(nrhs, CoarseMrhs);
  for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
  IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
  //--------------------------------------------------------------------
  // HDCG solver assembly
  //--------------------------------------------------------------------
  MultiRHSDeflation<CoarseVector> MrhsGuesser;
  MrhsGuesser.ImportEigenBasis(evec, eval);
  // MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
  MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
  MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
  MrhsProjector.ImportBasis(Aggregates.subspace);
  ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
  DoNothingGuesser<CoarseVector>  DoNothing;
  HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
  // Shifted smoother: CG on (HermOp + shift) damps low modes efficiently.
  // Shift ~ m^2 sits just above the spectral gap.
  RealD smootherShift = mass * mass;
  ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
  CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
  TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
    HDCG(1.0e-8, 500,
         HermOp,
         smoother,
         HPDSolve,   // M1 (coarse correction)
         HPDSolve,   // Vstart (initial guess projection)
         MrhsProjector,
         MrhsGuesser,
         CoarseMrhs);
  //--------------------------------------------------------------------
  // Solve: nrhs right-hand sides simultaneously
  //--------------------------------------------------------------------
  std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
  std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
  GridParallelRNG RNGrb2(UrbGrid); RNGrb2.SeedFixedIntegers({17,18,19,20});
  for (int r = 0; r < nrhs; r++) {
    random(RNGrb2, src[r]);
    sol[r] = Zero();
  }
  HDCG(src, sol);
  Grid_finalize();
  return 0;
 }