From 520b90259dce6bac473f8c68091d894bd4f74b45 Mon Sep 17 00:00:00 2001
From: Thomas Blum <thomasblum@Thomass-MacBook-Air.local>
Date: Wed, 27 May 2026 15:52:49 -0400
Subject: [PATCH] Add staggered HDCG multigrid test and mac-arm Homebrew build
 scripts

Test_staggered_hdcg.cc implements a two-level ADEF2 multigrid solver for
NaiveStaggeredFermion using SchurStaggeredOperator, following the mrhs
hermitian multigrid approach of arXiv:2409.03904. Uses a 33-point coarse
stencil (NextToNearestStencilGeometry4D) with nbasis=24, block={4,4,4,4},
and Chebyshev subspace generation with hi=5.0 (lambda_max ~4.6).

Also adds systems/mac-arm/sourceme-homebrew.sh and config-command-homebrew
for building Grid on Apple Silicon with Homebrew-installed dependencies.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CLAUDE.md                               |  17 ++
 systems/mac-arm/config-command-homebrew |  18 ++
 systems/mac-arm/sourceme-homebrew.sh    |  15 ++
 tests/debug/Test_staggered_hdcg.cc      | 267 ++++++++++++++++++++++++
 4 files changed, 317 insertions(+)
 create mode 100644 systems/mac-arm/config-command-homebrew
 create mode 100644 systems/mac-arm/sourceme-homebrew.sh
 create mode 100644 tests/debug/Test_staggered_hdcg.cc

diff --git a/CLAUDE.md b/CLAUDE.md
index 4b6acf2a..5e990b5f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -30,6 +30,9 @@ Key configure options:
 | `--enable-Nc=` | `3` (default), `2`, `4`, `5` |
 | `--with-gmp=`, `--with-mpfr=`, `--with-fftw=`, `--with-lime=` | paths to libs |
 | `--enable-hdf5`, `--enable-mkl`, `--enable-lapack` | optional features |
+| `--enable-debug=yes` | adds `-g`, removes `-O3` |
+
+Use `make V=1` for verbose compiler output (shows full flags; required for bug reports).
 
 Platform recipes from `README.md`:
 - **KNL**: `--enable-simd=KNL --enable-comms=mpi3-auto --enable-mkl`
@@ -96,3 +99,17 @@ GPU support is injected via macros (`accelerator_for`, `accelerator_for2dNB`). T
 - The `RealD`/`RealF`/`ComplexD`/`ComplexF` typedefs are used everywhere; avoid raw `double`/`float`.
 - Logging uses `Grid_log`, `Grid_error` macros (from `Grid/log/`); performance-critical paths use the `GRID_TRACE` / timer macros from `Grid/perfmon/`.
 - Reductions across MPI ranks go through `GridBase::GlobalSum` / `GlobalMax`; never reduce with bare MPI calls inside library code.
+
+## Skills
+
+`skills/` contains seven user-invocable Claude Code skills encoding deep domain knowledge for HPC work in this repo. Invoke with `/skill-name` or ask Claude to use them by name:
+
+| Skill | When to use |
+|-------|-------------|
+| `gpu-memory-performance` | Bandwidth/occupancy problems — `acceleratorThreads()` pitfalls, `coalescedRead/Write`, fused vs staged HBM patterns |
+| `gpu-runtime-correctness` | Wrong answers, non-deterministic results, premature `q.wait()` returns |
+| `communication-overlap` | Designing GPU+MPI overlap pipelines; replacing broken accelerator-aware MPI paths with host-staged 7-phase pipeline |
+| `mpi-heterogeneous` | Collective hangs, buffer aliasing in `MPI_Sendrecv`, heterogeneous topology bugs |
+| `hang-diagnosis` | Distinguishing ioctl hangs, infinite poll loops, collective deadlocks, and silent wrong-answer races |
+| `correctness-verification` | Reproducibility checksums, double-wait testing, bisecting non-deterministic failures |
+| `compiler-validation` | Confirming compiler/optimisation flags are safe before production runs |
diff --git a/systems/mac-arm/config-command-homebrew b/systems/mac-arm/config-command-homebrew
new file mode 100644
index 00000000..e3693601
--- /dev/null
+++ b/systems/mac-arm/config-command-homebrew
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Configure Grid from a build directory two levels below the source root,
+# e.g.: mkdir -p systems/mac-arm/build && cd systems/mac-arm/build && bash ../config-command-homebrew
+#
+# Prerequisites: source sourceme-homebrew.sh first.
+
+CXX=mpicxx ../../configure \
+    --enable-simd=GEN \
+    --enable-comms=mpi-auto \
+    --enable-Sp=yes \
+    --enable-unified=yes \
+    --prefix="${HOME}/Grid-install" \
+    --with-lime="${CLIME}" \
+    --with-openssl="${OPENSSL}" \
+    --with-gmp="${GMP}" \
+    --with-mpfr="${MPFR}" \
+    --with-fftw="${FFTW}" \
+    --disable-debug
diff --git a/systems/mac-arm/sourceme-homebrew.sh b/systems/mac-arm/sourceme-homebrew.sh
new file mode 100644
index 00000000..f8a2b9ab
--- /dev/null
+++ b/systems/mac-arm/sourceme-homebrew.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Environment for building Grid on Apple Silicon Mac with Homebrew dependencies.
+# Usage: source systems/mac-arm/sourceme-homebrew.sh
+
+HOMEBREW=/opt/homebrew
+
+export GMP=${HOMEBREW}/opt/gmp
+export MPFR=${HOMEBREW}/opt/mpfr
+export FFTW=${HOMEBREW}/opt/fftw
+export OPENSSL=${HOMEBREW}/opt/openssl@3
+export CLIME=/usr/local
+
+export PATH="${HOMEBREW}/opt/open-mpi/bin:${HOMEBREW}/bin:${PATH}"
+export LDFLAGS="-L${OPENSSL}/lib"
+export CPPFLAGS="-I${OPENSSL}/include"
diff --git a/tests/debug/Test_staggered_hdcg.cc b/tests/debug/Test_staggered_hdcg.cc
new file mode 100644
index 00000000..ed78333d
--- /dev/null
+++ b/tests/debug/Test_staggered_hdcg.cc
@@ -0,0 +1,267 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: tests/debug/Test_staggered_hdcg.cc
+
+    Authors: Thomas Blum, Peter Boyle
+
+    HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
+    for naive staggered fermions, based on arXiv:2409.03904.
+
+    Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
+      - NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
+      - 4D SchurStaggeredOperator:  Mpc = m^2 - D_oe * D_eo  (hermitian, positive-definite)
+      - vColourVector fine field type (staggered has colour but no spin)
+      - NextToNearestStencilGeometry4D: 33-point coarse stencil
+
+    Stencil count: D_oe*D_eo has 2-hop fine range.  With blocking B >= 2 the coarse
+    shifts have L1-distance <= 2, giving 33 stencil points in 4D:
+      1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
+    NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
+    To extend to ImprovedStaggeredFermion later, use B >= 6.
+
+    Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
+
+    Usage (after build):
+      ./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
+
+*************************************************************************************/
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/AdefMrhs.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
+
+using namespace Grid;
+
+// Non-converging CG used as a smoother (fixed number of iterations)
+template<class Field>
+class CGSmoother : public LinearFunction<Field>
+{
+public:
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator &_Op;
+  int iters;
+  CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
+  void operator()(const Field &in, Field &out)
+  {
+    ConjugateGradient<Field> CG(0.0, iters, false);
+    out = Zero();
+    CG(_Op, in, out);
+  }
+};
+
+int main(int argc, char **argv)
+{
+  Grid_init(&argc, &argv);
+
+  //--------------------------------------------------------------------
+  // Parameters — tune for production
+  //--------------------------------------------------------------------
+  const int nbasis = 24;   // near-null space dimension
+  const int cb     = 0;    // even checkerboard
+
+  RealD mass = 0.05;
+
+  // NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
+  // c1 = coefficient of the hopping term (1.0 = standard normalisation).
+  // u0 = tadpole factor (1.0 = no tadpole improvement).
+  RealD c1 = 1.0;
+  RealD u0 = 1.0;
+
+  //--------------------------------------------------------------------
+  // Grids
+  // Fine:   UGrid (4D full), UrbGrid (4D red-black)
+  // Coarse: Coarse4d  with dimensions = GridDefaultLatt() / Block
+  //
+  // Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
+  // NaiveStaggeredFermion works with any Block >= {2,2,2,2}
+  //--------------------------------------------------------------------
+  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
+      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
+  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  Coordinate Block({4, 4, 4, 4});
+  Coordinate clatt = GridDefaultLatt();
+  for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
+
+  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(
+      clatt, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
+
+  //--------------------------------------------------------------------
+  // RNG + gauge field
+  //--------------------------------------------------------------------
+  GridParallelRNG RNG4(UGrid);    RNG4.SeedFixedIntegers({1,2,3,4});
+  GridParallelRNG RNGrb(UrbGrid); RNGrb.SeedFixedIntegers({5,6,7,8});
+
+  LatticeGaugeField Umu(UGrid);
+  // Hot gauge start for testing; replace with NerscIO::readConfiguration for production
+  SU<Nc>::HotConfiguration(RNG4, Umu);
+
+  //--------------------------------------------------------------------
+  // Naive staggered fermion + Schur hermitian operator
+  //
+  // NaiveStaggeredFermion: nearest-neighbour hops only, no Naik term.
+  // SchurStaggeredOperator::Op = HermOp = Mpc = m^2 - D_oe * D_eo
+  // Positive-definite hermitian; CG and HDCG apply directly.
+  // No HermOpAdaptor wrapper needed (Op == HermOp for SchurStaggeredOperator).
+  //--------------------------------------------------------------------
+  NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
+  SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
+
+  //--------------------------------------------------------------------
+  // Subspace: Chebyshev-filtered near-null vectors
+  //
+  // hi = spectral upper bound of HermOp = m^2 + lambda_max(D_oe*D_eo).
+  // For naive staggered (c1=1), lambda_max(SchurStaggeredOperator) ~ 4.6.
+  // Use 5.0 to give a small margin; refine with PowerMethod for each config.
+  // PowerMethod for production.  Progressive Chebyshev filtering
+  // (ChebyshevNew) uses two-pass polynomial filter.
+  //--------------------------------------------------------------------
+  typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
+  Subspace Aggregates(Coarse4d, UrbGrid, cb);
+
+  RealD hi = 5.0;
+  Aggregates.CreateSubspaceChebyshevNew(RNGrb, HermOp, hi);
+  Aggregates.Orthogonalise();
+
+  //--------------------------------------------------------------------
+  // Coarse geometry: NextToNearestStencilGeometry4D
+  //   hops=2  ->  33 stencil points in 4D
+  //--------------------------------------------------------------------
+  NextToNearestStencilGeometry4D geom(Coarse4d);
+
+  std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
+
+  //--------------------------------------------------------------------
+  // Single-RHS coarse operator (used for correctness check below)
+  //--------------------------------------------------------------------
+  typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
+  typedef LittleDiracOp::CoarseVector CoarseVector;
+
+  LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
+  LDO.CoarsenOperator(HermOp, Aggregates);
+
+  //--------------------------------------------------------------------
+  // Correctness check: P M_fine P^T c  ≈  M_coarse c
+  //
+  // Promote a random coarse vector into the fine subspace, apply the
+  // fine operator, project back, and compare with the coarse operator
+  // applied directly.  Error should be at the level of subspace
+  // approximation quality (smaller = better basis vectors).
+  //--------------------------------------------------------------------
+  {
+    GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
+    CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
+    random(RNGc, c_src);
+
+    LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
+    Aggregates.PromoteFromSubspace(c_src, f_v);
+    HermOp.Op(f_v, f_Mv);
+    Aggregates.ProjectToSubspace(c_proj, f_Mv);
+
+    LDO.M(c_src, c_ldop);
+
+    c_proj -= c_ldop;
+    RealD err = norm2(c_proj) / norm2(c_ldop);
+    std::cout << GridLogMessage
+              << "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
+  }
+
+  //--------------------------------------------------------------------
+  // Multi-RHS coarse grid
+  //
+  // The extra leading dimension holds nrhs right-hand sides packed into
+  // SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
+  //--------------------------------------------------------------------
+  const int nrhs = vComplex::Nsimd() * 2;
+
+  Coordinate mpi   = GridDefaultMpi();
+  Coordinate rhMpi ({1,    mpi[0],  mpi[1],  mpi[2],  mpi[3]});
+  Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
+  Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
+
+  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
+
+  typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
+  MultiCoarseOp mrhs(geom, CoarseMrhs);
+  mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
+
+  //--------------------------------------------------------------------
+  // Coarse-grid Lanczos for deflation
+  //--------------------------------------------------------------------
+  typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
+  MrhsHermOp MrhsCoarseOp(mrhs);
+
+  // Estimate spectral bounds for Lanczos Chebyshev filter
+  CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
+  PowerMethod<CoarseVector> cPM;
+  RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
+  RealD lambda_lo  = mass * mass * 0.5;
+
+  Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
+
+  int Nk    = 48;
+  int Nm    = 96;
+  int Nstop = Nk;
+
+  GridParallelRNG CRNG(CoarseMrhs); CRNG.SeedFixedIntegers({13,14,15,16});
+
+  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
+    IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
+        Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
+
+  int Nconv;
+  std::vector<RealD>        eval(Nm);
+  std::vector<CoarseVector> evec(Nm, Coarse4d);
+  std::vector<CoarseVector> c_srcs(nrhs, CoarseMrhs);
+  for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
+  IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
+
+  //--------------------------------------------------------------------
+  // HDCG solver assembly
+  //--------------------------------------------------------------------
+  MultiRHSDeflation<CoarseVector> MrhsGuesser;
+  MrhsGuesser.ImportEigenBasis(evec, eval);
+
+  // MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
+  MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
+  MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
+  MrhsProjector.ImportBasis(Aggregates.subspace);
+
+  ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
+  DoNothingGuesser<CoarseVector>  DoNothing;
+  HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
+
+  // Shifted smoother: CG on (HermOp + shift) damps low modes efficiently.
+  // Shift ~ m^2 sits just above the spectral gap.
+  RealD smootherShift = mass * mass;
+  ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
+  CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
+
+  TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
+    HDCG(1.0e-8, 500,
+         HermOp,
+         smoother,
+         HPDSolve,   // M1 (coarse correction)
+         HPDSolve,   // Vstart (initial guess projection)
+         MrhsProjector,
+         MrhsGuesser,
+         CoarseMrhs);
+
+  //--------------------------------------------------------------------
+  // Solve: nrhs right-hand sides simultaneously
+  //--------------------------------------------------------------------
+  std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
+  std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
+
+  GridParallelRNG RNGrb2(UrbGrid); RNGrb2.SeedFixedIntegers({17,18,19,20});
+  for (int r = 0; r < nrhs; r++) {
+    random(RNGrb2, src[r]);
+    sol[r] = Zero();
+  }
+
+  HDCG(src, sol);
+
+  Grid_finalize();
+  return 0;
+}