More info

benchmarks: add DhopEO benchmarks to Benchmark_staggered and Benchmark_staggeredF
Also add NaiveStaggeredFermionF to Benchmark_staggeredF, and fix bug where NaiveStaggered timed loop was calling Ds.Dhop instead of Dn.Dhop. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-23 03:53:27 +01:00 · 2026-05-29 16:10:11 -04:00 · 2026-05-29 13:55:27 -04:00 · 2026-05-29 13:36:57 -04:00 · 2026-05-28 16:43:23 -04:00 · 2026-05-28 11:41:41 -04:00
24 changed files with 1185 additions and 568 deletions
@@ -30,6 +30,9 @@ Key configure options:
 | `--enable-Nc=` | `3` (default), `2`, `4`, `5` |
 | `--with-gmp=`, `--with-mpfr=`, `--with-fftw=`, `--with-lime=` | paths to libs |
 | `--enable-hdf5`, `--enable-mkl`, `--enable-lapack` | optional features |
+| `--enable-debug=yes` | adds `-g`, removes `-O3` |
+
+Use `make V=1` for verbose compiler output (shows full flags; required for bug reports).

 Platform recipes from `README.md`:
 - **KNL**: `--enable-simd=KNL --enable-comms=mpi3-auto --enable-mkl`
@@ -96,3 +99,17 @@ GPU support is injected via macros (`accelerator_for`, `accelerator_for2dNB`). T
 - The `RealD`/`RealF`/`ComplexD`/`ComplexF` typedefs are used everywhere; avoid raw `double`/`float`.
 - Logging uses `Grid_log`, `Grid_error` macros (from `Grid/log/`); performance-critical paths use the `GRID_TRACE` / timer macros from `Grid/perfmon/`.
 - Reductions across MPI ranks go through `GridBase::GlobalSum` / `GlobalMax`; never reduce with bare MPI calls inside library code.
+
+## Skills
+
+`skills/` contains seven user-invocable Claude Code skills encoding deep domain knowledge for HPC work in this repo. Invoke with `/skill-name` or ask Claude to use them by name:
+
+| Skill | When to use |
+|-------|-------------|
+| `gpu-memory-performance` | Bandwidth/occupancy problems — `acceleratorThreads()` pitfalls, `coalescedRead/Write`, fused vs staged HBM patterns |
+| `gpu-runtime-correctness` | Wrong answers, non-deterministic results, premature `q.wait()` returns |
+| `communication-overlap` | Designing GPU+MPI overlap pipelines; replacing broken accelerator-aware MPI paths with host-staged 7-phase pipeline |
+| `mpi-heterogeneous` | Collective hangs, buffer aliasing in `MPI_Sendrecv`, heterogeneous topology bugs |
+| `hang-diagnosis` | Distinguishing ioctl hangs, infinite poll loops, collective deadlocks, and silent wrong-answer races |
+| `correctness-verification` | Reproducibility checksums, double-wait testing, bisecting non-deterministic failures |
+| `compiler-validation` | Confirming compiler/optimisation flags are safe before production runs |
@@ -288,17 +288,21 @@ public:
      
      //----------------------------------------------------------------------
      if ( Nm>Nk ) {
-	//        Glog <<" #Apply shifted QR transformations "<<std::endl;
-        //int k2 = Nk+Nu;
+        Glog <<" #Ritz values of poly(A) before shift ["<<Nm<<" values]:"<<std::endl;
+        for(int i=0; i<Nm; ++i){
+          std::cout.precision(8);
+          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+          std::cout << "Rval = "<<std::setw(16)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+        }
+
        int k2 = Nk;
-      
+
        Eigen::MatrixXcd BTDM = Eigen::MatrixXcd::Identity(Nm,Nm);
        Q = Eigen::MatrixXcd::Identity(Nm,Nm);
-        
+
        unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);

        for(int ip=Nk; ip<Nm; ++ip){
-	  Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
          shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
        }
        
@@ -326,11 +330,11 @@ public:
        Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
        diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
        _sort.push(eval2,Nk);
-	//	Glog << "#Ritz value after shift: "<< std::endl;
+        Glog << "#Ritz values of poly(A) after shift ["<<Nk<<" values]:"<<std::endl;
        for(int i=0; i<Nk; ++i){
-	  //	  std::cout.precision(13);
-	  //	  std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	  //	  std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+          std::cout.precision(8);
+          std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+          std::cout << "Rval = "<<std::setw(16)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
        }
      }
      //----------------------------------------------------------------------
@@ -710,11 +714,17 @@ private:
    //lme[0][L] = beta;
    
    for (int u=0; u<Nu; ++u) {
-      //      Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
      GRID_ASSERT (!isnan(norm2(w[u])));
-      for (int k=L+u; k<R; ++k) {
-	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
-      }
+    }
+    // Diagnostic: print alpha (lmd) and beta (lme) block diagonals for this step
+    {
+      Glog << " blk b="<<std::setw(3)<<b<<"  alpha:";
+      for (int u=0; u<Nu; ++u)
+        std::cout << " " << std::setw(10) << std::setprecision(6) << real(lmd[u][L+u]);
+      std::cout << "   |beta|:";
+      for (int u=0; u<Nu; ++u)
+        std::cout << " " << std::setw(10) << std::setprecision(6) << abs(lme[u][L+u]);
+      std::cout << std::endl;
    }
    //    Glog << "LinAlg done "<< std::endl;

@@ -1,7 +1,6 @@
 #pragma once

 #if defined(GRID_CUDA)
-
 #include <cub/cub.cuh>
 #define gpucub cub
 #define gpuError_t cudaError_t
@@ -57,8 +56,13 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
  //copy offsets to device
  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  
+#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
+  #define GRID_CUB_SUM_OP ::cuda::std::plus<>{}
+#else
+  #define GRID_CUB_SUM_OP ::gpucub::Sum()
+#endif
  
-  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
+  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
@@ -82,11 +86,13 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
  });
  
  //issue segmented reductions in computeStream
-  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
+  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }
+
+#undef GRID_CUB_SUM_OP
  
  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  
@@ -113,6 +113,14 @@ accelerator_inline RealD    adj(const RealD  & r){ return r; }
 accelerator_inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
 accelerator_inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }

+#if defined(GRID_CUDA) || defined(GRID_HIP)
+//Provide for convenience
+accelerator_inline std::complex<double> conjugate(const std::complex<double>& r){ return(conj(r)); }
+accelerator_inline std::complex<float>  conjugate(const std::complex<float>& r) { return(conj(r)); }
+accelerator_inline std::complex<double> adj(const std::complex<double>& r)      { return(conj(r)); }
+accelerator_inline std::complex<float>  adj(const std::complex<float>& r)       { return(conj(r)); }
+#endif
+
 accelerator_inline RealF real(const RealF  & r){ return r; }
 accelerator_inline RealD real(const RealD  & r){ return r; }
 accelerator_inline RealF real(const ComplexF  & r){ return r.real(); }
@@ -96,7 +96,9 @@ void     acceleratorInit(void);

 #ifdef GRID_CUDA

+NAMESPACE_END(Grid);
 #include <cuda.h>
+NAMESPACE_BEGIN(Grid);

 #ifdef __CUDA_ARCH__
 #define GRID_SIMT
@@ -33,7 +33,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 using namespace std;
 using namespace Grid;
- ;

 int main (int argc, char ** argv)
 {
@@ -97,19 +96,49 @@ int main (int argc, char ** argv)
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
-  
-  std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
+  NaiveStaggeredFermionD Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
+
+  FermionField src_e(&RBGrid); pickCheckerboard(Even, src_e, src);
+  FermionField src_o(&RBGrid); pickCheckerboard(Odd,  src_o, src);
+  FermionField res_e(&RBGrid); res_e = Zero();
+
  int ncall=1000;
+
+  // ImprovedStaggered Dhop
+  for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
  double t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Ds.Dhop(src,result,0);
-  }
+  for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
  double t1=usecond();
-  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
-  
-  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
-  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // 1146 flops/site
+  std::cout<<GridLogMessage << ncall << " ImprovedStaggered Dhop calls in "<< (t1-t0)<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "ImprovedStaggered Dhop   mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  // ImprovedStaggered DhopEO
+  for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
+  t0=usecond();
+  for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
+  t1=usecond();
+  flops=(16*(3*(6+8+8)) + 15*3*2)*(volume/2)*ncall;
+  std::cout<<GridLogMessage << ncall << " ImprovedStaggered DhopEO calls in "<< (t1-t0)<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "ImprovedStaggered DhopEO mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  // NaiveStaggered Dhop
+  for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
+  t0=usecond();
+  for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
+  t1=usecond();
+  flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall;
+  std::cout<<GridLogMessage << ncall << " NaiveStaggered Dhop calls in "<< (t1-t0)<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "NaiveStaggered    Dhop   mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  // NaiveStaggered DhopEO
+  for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
+  t0=usecond();
+  for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
+  t1=usecond();
+  flops=(8*(3*(6+8+8)) + 7*3*2)*(volume/2)*ncall;
+  std::cout<<GridLogMessage << ncall << " NaiveStaggered DhopEO calls in "<< (t1-t0)<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "NaiveStaggered    DhopEO mflop/s =   "<< flops/(t1-t0)<<std::endl;

  Grid_finalize();
 }
@@ -96,22 +96,45 @@ int main (int argc, char ** argv)
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  ImprovedStaggeredFermionF Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
-  
-  std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
-  int ncall=1000;
-  for(int i=0;i<ncall;i++){
-    Ds.Dhop(src,result,0);
-  }
+  NaiveStaggeredFermionF    Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
+
+  FermionField src_e(&RBGrid); pickCheckerboard(Even, src_e, src);
+  FermionField src_o(&RBGrid); pickCheckerboard(Odd,  src_o, src);
+  FermionField res_e(&RBGrid); res_e = Zero();
+
+  int ncall=10000;
+
+  // ImprovedStaggered Dhop
+  for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
  double t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Ds.Dhop(src,result,0);
-  }
+  for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
  double t1=usecond();
-  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
-  
-  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
-  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // 1146 flops/site
+  std::cout<<GridLogMessage << "ImprovedStaggered Dhop   mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  // ImprovedStaggered DhopEO
+  for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
+  t0=usecond();
+  for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
+  t1=usecond();
+  flops=(16*(3*(6+8+8)) + 15*3*2)*(volume/2)*ncall;
+  std::cout<<GridLogMessage << "ImprovedStaggered DhopEO mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  // NaiveStaggered Dhop
+  for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
+  t0=usecond();
+  for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
+  t1=usecond();
+  flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall;
+  std::cout<<GridLogMessage << "NaiveStaggered    Dhop   mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  // NaiveStaggered DhopEO
+  for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
+  t0=usecond();
+  for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
+  t1=usecond();
+  flops=(8*(3*(6+8+8)) + 7*3*2)*(volume/2)*ncall;
+  std::cout<<GridLogMessage << "NaiveStaggered    DhopEO mflop/s =   "<< flops/(t1-t0)<<std::endl;

  Grid_finalize();
 }
@@ -716,6 +716,161 @@ public:
    return mflops_best;
  }

+  static double NaiveStaggered(int L)
+  {
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+    
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark NaiveStaggered on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
+    
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+    typedef NaiveStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion; 
+    typedef LatticeGaugeFieldF Gauge;
+    
+    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
+
+    typename Action::ImplParams params;
+    Action Ds(Umu,*FGrid,*FrbGrid,mass,c1,u0,params);
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+  
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+    
+      const int num_cases = 2;
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+      
+      controls Cases [] = {
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+	
+	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
+	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+      
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	
+	int nwarm = 10;
+	double t0=usecond();
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+
+	uint64_t no    = 50;
+	uint64_t ni    = 100;
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+
+	time_statistics timestat;
+	std::vector<double> t_time(no);
+	for(uint64_t i=0;i<no;i++){
+	  t0=usecond();
+	  for(uint64_t j=0;j<ni;j++){
+	    Ds.DhopEO(src_o,r_e,DaggerNo);
+	  }
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	
+	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=((8*(3*(6+8+8)) + 7*3*2)*1.0*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+	
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min*ni;
+	mf_lo = flops/timestat.max*ni;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	mflops = flops/timestat.mean*ni;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+	
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
+      
+      }
+
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    return mflops_best;
+  }
+  
  static double Clover(int L)
  {
    double mflops;
@@ -887,6 +1042,7 @@ int main (int argc, char ** argv)
  std::vector<double> clover;
  std::vector<double> dwf4;
  std::vector<double> staggered;
+  std::vector<double> naive_staggered;

  int Ls=1;
  if (do_dslash){
@@ -914,13 +1070,21 @@ int main (int argc, char ** argv)
    staggered.push_back(result);
  }

+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Naive Staggered dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::NaiveStaggered(L_list[l]) ;
+    naive_staggered.push_back(result);
+  }
+

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered \t\t Naive Staggered" <<std::endl;
  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<" \t\t "<<naive_staggered[l]<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }
@@ -930,14 +1094,14 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
+    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered \t\t NaiveStag \t|\t (GF/s per node)" <<std::endl;
    fprintf(FP,"Per node summary table\n");
    fprintf(FP,"\n");
-    fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
+    fprintf(FP,"L , Wilson, DWF4, Staggered, NaiveStag\n");
    fprintf(FP,"\n");
    for(int l=0;l<L_list.size();l++){
-      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
-      fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<" \t " <<naive_staggered[l]/NN<<std::endl;
+      fprintf(FP,"%d , %.0f, %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.,naive_staggered[l]/NN/1000.);
    }
    fprintf(FP,"\n");
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -1,76 +1,91 @@
+Per node summary table
+
+L , Wilson, DWF4, Staggered, NaiveStag
+
+8 , 90, 933, 38, 23
+12 , 403, 1688, 178, 113
+16 , 188, 1647, 449, 295
+24 , 947, 1574, 674, 553
+32 , 931, 1371, 718, 643
+
 Memory Bandwidth

 Bytes, GB/s per node
-6291456, 379.297050
-100663296, 3754.674992
-509607936, 6521.472413
-1610612736, 8513.456479
-3932160000, 9018.901766
-
-
-GEMM
-
- M, N, K, BATCH, GF/s per rank
-16, 8, 16, 256, 0.564958
-16, 16, 16, 256, 243.148058
-16, 32, 16, 256, 440.346877
-32, 8, 32, 256, 439.194136
-32, 16, 32, 256, 847.334141
-32, 32, 32, 256, 1430.892623
-64, 8, 64, 256, 1242.756741
-64, 16, 64, 256, 2196.689493
-64, 32, 64, 256, 3697.458072
-16, 8, 256, 256, 899.582627
-16, 16, 256, 256, 1673.537756
-16, 32, 256, 256, 2959.597089
-32, 8, 256, 256, 1558.858630
-32, 16, 256, 256, 2864.839445
-32, 32, 256, 256, 4810.671254
-64, 8, 256, 256, 2386.092942
-64, 16, 256, 256, 4451.665937
-64, 32, 256, 256, 5942.124095
-8, 256, 16, 256, 799.867271
-16, 256, 16, 256, 1584.624888
-32, 256, 16, 256, 1949.422338
-8, 256, 32, 256, 1389.417474
-16, 256, 32, 256, 2668.344493
-32, 256, 32, 256, 3234.162120
-8, 256, 64, 256, 2150.925128
-16, 256, 64, 256, 4012.488132
-32, 256, 64, 256, 5154.785521
-
+786432, 40.271620
+12582912, 433.611792
+63700992, 905.374321
+201326592, 1114.979152
+491520000, 1180.241898


 Communications

 Packet bytes, direction, GB/s per node
-4718592, 1, 245.026198
-4718592, 2, 251.180996
-4718592, 3, 361.110977
-4718592, 5, 247.898447
-4718592, 6, 249.867523
-4718592, 7, 359.033061
-15925248, 1, 255.030946
-15925248, 2, 264.453890
-15925248, 3, 392.949183
-15925248, 5, 256.040644
-15925248, 6, 264.681896
-15925248, 7, 392.102622
-37748736, 1, 258.823333
-37748736, 2, 268.181577
-37748736, 3, 401.478191
-37748736, 5, 258.995363
-37748736, 6, 268.206586
-37748736, 7, 400.397611


-Per node summary table
+GEMM
+
+ M, N, K, BATCH, GF/s per rank fp64
+16, 8, 16, 4096, 693.316363
+16, 12, 16, 4096, 657.277058
+16, 16, 16, 4096, 711.992616
+32, 8, 32, 4096, 821.084324
+32, 12, 32, 4096, 1279.852719
+32, 16, 32, 4096, 2647.096674
+64, 8, 64, 4096, 2630.192325
+64, 12, 64, 4096, 3338.071321
+64, 16, 64, 4096, 3950.899281
+16, 8, 256, 4096, 1638.362501
+16, 12, 256, 4096, 2377.502234
+16, 16, 256, 4096, 3048.328833
+32, 8, 256, 4096, 2917.384276
+32, 12, 256, 4096, 4103.085151
+32, 16, 256, 4096, 5102.971860
+64, 8, 256, 4096, 3222.258206
+64, 12, 256, 4096, 4619.456391
+64, 16, 256, 4096, 5847.916650
+8, 256, 16, 4096, 1728.073337
+12, 256, 16, 4096, 2356.653970
+16, 256, 16, 4096, 2676.876038
+8, 256, 32, 4096, 2611.531990
+12, 256, 32, 4096, 3451.573106
+16, 256, 32, 4096, 3966.915301
+8, 256, 64, 4096, 3436.248737
+12, 256, 64, 4096, 4539.497945
+16, 256, 64, 4096, 5307.992323
+
+
+
+GEMM
+
+ M, N, K, BATCH, GF/s per rank fp32
+16, 8, 16, 4096, 499.017445
+16, 12, 16, 4096, 731.543385
+16, 16, 16, 4096, 958.800786
+32, 8, 32, 4096, 1549.813550
+32, 12, 32, 4096, 2147.907502
+32, 16, 32, 4096, 2601.698596
+64, 8, 64, 4096, 3785.446233
+64, 12, 64, 4096, 5116.694843
+64, 16, 64, 4096, 6109.345016
+16, 8, 256, 4096, 1206.627737
+16, 12, 256, 4096, 1809.699599
+16, 16, 256, 4096, 2412.014053
+32, 8, 256, 4096, 2406.114488
+32, 12, 256, 4096, 3605.531907
+32, 16, 256, 4096, 4798.444037
+64, 8, 256, 4096, 4688.711196
+64, 12, 256, 4096, 6990.696301
+64, 16, 256, 4096, 9214.749925
+8, 256, 16, 4096, 2596.307289
+12, 256, 16, 4096, 3439.892562
+16, 256, 16, 4096, 3907.201036
+8, 256, 32, 4096, 3012.752067
+12, 256, 32, 4096, 3904.217583
+16, 256, 32, 4096, 4599.047092
+8, 256, 64, 4096, 3721.999042
+12, 256, 64, 4096, 5098.573927
+16, 256, 64, 4096, 6159.080872

-L , Wilson, DWF4, Staggered, GF/s per node

-8 , 155, 1386, 50
-12 , 694, 4208, 230
-16 , 1841, 6675, 609
-24 , 3934, 8573, 1641
-32 , 5083, 9771, 3086

@@ -1,4 +1,3 @@
-CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 ../../configure --enable-comms=mpi-auto \
 --with-lime=$CLIME \
 --enable-unified=no \
@@ -9,8 +8,9 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
+--with-gmp=$GMP \
+--with-mpfr=$MPFR \
+--with-openssl=$OPENSSL \
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include " \
@@ -2,6 +2,10 @@
 echo spack
 . /autofs/nccs-svm1_home1/paboyle/spack/share/spack/setup-env.sh

+export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
+export MPFR=`spack find --paths mpfr    | grep ^mpfr  | awk '{print $2}' `
+export OPENSSL=`spack find --paths openssl | grep openssl | awk  '{print $2}' `
+export GMP=`spack find --paths gmp      | grep ^gmp | awk '{print $2}' `

 module load cce/21.0.0
 module load cpe/26.03
@@ -1,12 +1,12 @@
 DIR=`pwd`
-PREFIX=$HOME/DDHMC/Grid/systems/Prerequisites/install/
+
 ../../configure \
    --enable-comms=mpi \
    --enable-simd=GPU \
    --enable-shm=nvlink \
    --enable-gen-simd-width=64 \
-    --with-gmp=$PREFIX \
-    --with-mpfr=$PREFIX \
+    --with-gmp=$GMP \
+    --with-mpfr=$MPFR \
    --enable-accelerator=cuda \
    --disable-fermion-reps \
    --disable-unified \
@@ -1,4 +1,6 @@
-
 export CRAY_ACCEL_TARGET=nvidia80
+source /global/homes/p/pboyle/spack/share/spack/setup-env.sh
+export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
+export GMP=`spack find --paths gmp   | grep gmp  | cut -c 12-`

-module load PrgEnv-gnu cpe-cuda cudatoolkit/11.4
+module load PrgEnv-gnu cpe-cuda cudatoolkit/12.0
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Configure Grid from a build directory two levels below the source root,
+# e.g.: mkdir -p systems/mac-arm/build && cd systems/mac-arm/build && bash ../config-command-homebrew
+#
+# Prerequisites: source sourceme-homebrew.sh first.
+
+CXX=mpicxx ../../configure \
+    --enable-simd=GEN \
+    --enable-comms=mpi-auto \
+    --enable-unified=yes \
+    --prefix="${HOME}/Grid-install" \
+    --with-lime="${CLIME}" \
+    --with-openssl="${OPENSSL}" \
+    --with-gmp="${GMP}" \
+    --with-mpfr="${MPFR}" \
+    --with-fftw="${FFTW}" \
+    --enable-Sp=no \
+    --disable-fermion-reps \
+    --disable-gparity \
+    --disable-debug
@@ -3,7 +3,10 @@
 CXX=mpicxx ../../configure \
 	   --enable-simd=GEN \
 	   --enable-comms=mpi-auto \
-	   --enable-Sp=yes \
+	   --enable-Sp=no \
+	   --disable-fermion-reps \
+	   --disable-gparity \
+	   --with-fftw=$FFTW \
 	   --enable-unified=yes \
 	   --prefix /Users/peterboyle/QCD/vtk/Grid/install \
 	   --with-lime=$CLIME \
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Environment for building Grid on Apple Silicon Mac with Homebrew dependencies.
+# Usage: source systems/mac-arm/sourceme-homebrew.sh
+
+HOMEBREW=/opt/homebrew
+
+export GMP=${HOMEBREW}/opt/gmp
+export MPFR=${HOMEBREW}/opt/mpfr
+export FFTW=${HOMEBREW}/opt/fftw
+export OPENSSL=${HOMEBREW}/opt/openssl@3
+export CLIME=/usr/local
+
+export PATH="${HOMEBREW}/opt/open-mpi/bin:${HOMEBREW}/bin:${PATH}"
+export LDFLAGS="-L${OPENSSL}/lib"
+export CPPFLAGS="-I${OPENSSL}/include"
@@ -0,0 +1,11 @@
+source /Users/peterboyle/QCD//Spack/spack//share/spack/setup-env.sh
+
+export FFTW=`spack find --paths fftw    | grep ^fftw   | awk '{print $2}' `
+#export HDF5=`spack find --paths hdf5+cxx   | grep ^hdf5   | awk '{print $2}' `
+export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
+export MPFR=`spack find --paths mpfr    | grep ^mpfr  | awk '{print $2}' `
+export OPENSSL=`spack find --paths openssl | grep openssl | awk  '{print $2}' `
+export GMP=`spack find --paths gmp      | grep ^gmp | awk '{print $2}' `
+
+export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
@@ -36,8 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;

-gridblasHandle_t GridBLAS::gridblasHandle;
-int            GridBLAS::gridblasInit;
+//gridblasHandle_t GridBLAS::gridblasHandle;
+//int            GridBLAS::gridblasInit;

 ///////////////////////
 // Tells little dirac op to use MdagM as the .Op()
@@ -1,76 +0,0 @@
-/*
- * Isolating the hipfft HIPFFT_PARSE_ERROR on ROCm 7 / hipFFT 1.0.20.
- *
- * Tests three orderings with an empty rocFFT cache to find which GPU
- * operation before plan creation triggers the failure:
- *   A) hipMalloc only            — hypothesis: passes (no async GPU work)
- *   B) hipMalloc + hipMemset     — hypothesis: fails  (async GPU work in flight)
- *   C) hipMalloc + hipMemset     — hypothesis: passes (work completed before plan)
- *      + hipDeviceSynchronize
- *
- * Compile:
- *   hipcc -o Test_hipfft_bug_fail Test_hipfft_bug_fail.cc -lhipfft
- *
- * Run with empty cache:
- *   rm -rf ~/.cache/
- *   ./Test_hipfft_bug_fail
- */
-
-#include <cstdio>
-#include <hipfft/hipfft.h>
-#include <hip/hip_runtime.h>
-
-static const char *res(hipfftResult rv) {
-  return rv == HIPFFT_SUCCESS ? "SUCCESS" : "PARSE_ERROR";
-}
-
-static hipfftResult makePlan(int G, int howmany) {
-  int n[] = {G};
-  hipfftHandle p;
-  size_t workSize = 0;
-  hipfftCreate(&p);
-  hipfftResult rv = hipfftMakePlanMany(p, 1, n,
-                                       nullptr, 1, G, nullptr, 1, G,
-                                       HIPFFT_Z2Z, howmany, &workSize);
-  hipfftDestroy(p);
-  return rv;
-}
-
-int main(void) {
-  hipDeviceProp_t prop;
-  hipGetDeviceProperties(&prop, 0);
-  printf("Device: %s\n", prop.name);
-#ifdef hipfftVersionMinor
-  printf("hipFFT version: %d.%d.%d\n\n",
-         hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
-#endif
-
-  for (int G : {4, 8, 16, 32}) {
-    int howmany = 512;
-    long nelems = (long)G * howmany;
-    hipfftDoubleComplex *buf = nullptr;
-    hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex));
-
-    // Tests ordered so each runs before a prior success can populate the cache.
-
-    // B first: hipMalloc + hipMemset (async GPU work in flight)
-    // If this fails, A (no hipMemset) will pass, confirming hipMemset is the trigger.
-    hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
-    hipfftResult rvB = makePlan(G, howmany);
-    printf("G=%-4d  B) hipMalloc + hipMemset       : %s\n", G, res(rvB));
-
-    // C: hipMalloc + hipMemset + sync — does syncing before plan creation fix it?
-    hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
-    hipDeviceSynchronize();
-    hipfftResult rvC = makePlan(G, howmany);
-    printf("G=%-4d  C) hipMalloc + hipMemset + sync: %s\n", G, res(rvC));
-
-    // A last: hipMalloc only, no async GPU work — should always pass
-    hipfftResult rvA = makePlan(G, howmany);
-    printf("G=%-4d  A) hipMalloc only             : %s\n\n", G, res(rvA));
-
-    hipFree(buf);
-  }
-
-  return 0;
-}
@@ -1,61 +0,0 @@
-/*
- * Minimal program demonstrating the workaround for the hipfft ROCm 7 bug.
- *
- * Workaround: create the hipfft plan BEFORE any hipMalloc.  Plan creation
- * for G < 32 then succeeds even with an empty rocFFT cache.
- *
- * Compile:
- *   hipcc -o Test_hipfft_bug_pass Test_hipfft_bug_pass.cc -lhipfft
- *
- * Run:
- *   rm -rf ~/.cache/rocfft
- *   ./Test_hipfft_bug_pass
- *
- * Expected: all G values succeed.
- * Compare with Test_hipfft_bug_fail.cc which uses the opposite ordering.
- */
-
-#include <cstdio>
-#include <hipfft/hipfft.h>
-#include <hip/hip_runtime.h>
-
-int main(void) {
-  hipDeviceProp_t prop;
-  hipGetDeviceProperties(&prop, 0);
-  printf("Device: %s\n", prop.name);
-#ifdef hipfftVersionMinor
-  printf("hipFFT version: %d.%d.%d\n\n",
-         hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
-#endif
-
-  for (int G : {8, 16, 32}) {
-    int howmany = 512;
-    int n[] = {G};
-    long nelems = (long)G * howmany;
-
-    // Plan created BEFORE hipMalloc — succeeds for all G
-    hipfftHandle p;
-    size_t workSize = 0;
-    hipfftCreate(&p);
-    hipfftResult rv = hipfftMakePlanMany(p, 1, n,
-                                         nullptr, 1, G, nullptr, 1, G,
-                                         HIPFFT_Z2Z, howmany, &workSize);
-    printf("G=%-4d  plan-then-hipMalloc: %d (%s)\n",
-           G, (int)rv, rv == HIPFFT_SUCCESS ? "HIPFFT_SUCCESS" : "HIPFFT_PARSE_ERROR");
-
-    if (rv == HIPFFT_SUCCESS) {
-      hipfftDoubleComplex *buf = nullptr;
-      hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex));
-      hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
-      rv = hipfftExecZ2Z(p, buf, buf, HIPFFT_FORWARD);
-      hipDeviceSynchronize();
-      printf("G=%-4d  execFwd:             %d (%s)\n",
-             G, (int)rv, rv == HIPFFT_SUCCESS ? "HIPFFT_SUCCESS" : "FAILED");
-      hipFree(buf);
-    }
-
-    hipfftDestroy(p);
-  }
-
-  return 0;
-}
@@ -1,142 +0,0 @@
-/*
- * Minimal reproducer for hipfftMakePlanMany / hipfftPlanMany failures.
- *
- * Compile on Frontier (no Grid headers needed):
- *   hipcc -o Test_hipfft_minimal Test_hipfft_minimal.cc -lhipfft
- *
- * Run:
- *   ./Test_hipfft_minimal
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <hipfft/hipfft.h>
-#include <hip/hip_runtime.h>
-
-static const char *hipfftResultString(hipfftResult r) {
-  switch (r) {
-  case HIPFFT_SUCCESS:                  return "HIPFFT_SUCCESS";
-  case HIPFFT_INVALID_PLAN:             return "HIPFFT_INVALID_PLAN";
-  case HIPFFT_ALLOC_FAILED:             return "HIPFFT_ALLOC_FAILED";
-  case HIPFFT_INVALID_TYPE:             return "HIPFFT_INVALID_TYPE";
-  case HIPFFT_INVALID_VALUE:            return "HIPFFT_INVALID_VALUE";
-  case HIPFFT_INTERNAL_ERROR:           return "HIPFFT_INTERNAL_ERROR";
-  case HIPFFT_EXEC_FAILED:              return "HIPFFT_EXEC_FAILED";
-  case HIPFFT_SETUP_FAILED:             return "HIPFFT_SETUP_FAILED";
-  case HIPFFT_INVALID_SIZE:             return "HIPFFT_INVALID_SIZE";
-  case HIPFFT_UNALIGNED_DATA:           return "HIPFFT_UNALIGNED_DATA";
-  case HIPFFT_INCOMPLETE_PARAMETER_LIST:return "HIPFFT_INCOMPLETE_PARAMETER_LIST";
-  case HIPFFT_INVALID_DEVICE:           return "HIPFFT_INVALID_DEVICE";
-  case HIPFFT_PARSE_ERROR:              return "HIPFFT_PARSE_ERROR";
-  case HIPFFT_NO_WORKSPACE:             return "HIPFFT_NO_WORKSPACE";
-  case HIPFFT_NOT_IMPLEMENTED:          return "HIPFFT_NOT_IMPLEMENTED";
-  case HIPFFT_NOT_SUPPORTED:            return "HIPFFT_NOT_SUPPORTED";
-  default:                              return "UNKNOWN";
-  }
-}
-
-// Plan creation + execution for (G, howmany).
-// Tests two orderings to isolate whether a prior hipMalloc poisons hipfft
-// plan creation for small G on ROCm 7:
-//   A) plan BEFORE hipMalloc  — hypothesis: succeeds
-//   B) hipMalloc BEFORE plan  — hypothesis: fails for G < 32
-static void tryPlanAndExec(int G, long howmany) {
-  int n[] = {G};
-  long nelems = (long)G * howmany;
-
-  printf("--- G=%-4d  howmany=%-10ld  total_elems=%-12ld ---\n",
-         G, howmany, nelems);
-
-  // --- A: create plan first, allocate buffer afterwards ---
-  {
-    hipfftHandle p;
-    size_t workSize = 0;
-    hipfftCreate(&p);
-    hipfftResult rv = hipfftMakePlanMany(p, 1, n,
-                                         nullptr, 1, G, nullptr, 1, G,
-                                         HIPFFT_Z2Z, (int)howmany, &workSize);
-    printf("  plan-first  create : %d (%s)\n", (int)rv, hipfftResultString(rv));
-    if (rv == HIPFFT_SUCCESS) {
-      hipfftDoubleComplex *buf = nullptr;
-      hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex));
-      hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
-      rv = hipfftExecZ2Z(p, buf, buf, HIPFFT_FORWARD);
-      hipDeviceSynchronize();
-      printf("  plan-first  execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
-      hipFree(buf);
-    }
-    hipfftDestroy(p);
-  }
-
-  // --- B: hipMalloc first, create plan afterwards ---
-  {
-    hipfftDoubleComplex *buf = nullptr;
-    hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex));
-    hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
-
-    hipfftHandle p;
-    size_t workSize = 0;
-    hipfftCreate(&p);
-    hipfftResult rv = hipfftMakePlanMany(p, 1, n,
-                                         nullptr, 1, G, nullptr, 1, G,
-                                         HIPFFT_Z2Z, (int)howmany, &workSize);
-    printf("  malloc-first create : %d (%s)\n", (int)rv, hipfftResultString(rv));
-    if (rv == HIPFFT_SUCCESS) {
-      rv = hipfftExecZ2Z(p, buf, buf, HIPFFT_FORWARD);
-      hipDeviceSynchronize();
-      printf("  malloc-first execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
-    }
-    hipfftDestroy(p);
-    hipFree(buf);
-  }
-
-  printf("\n");
-}
-
-int main(void) {
-  // Print HIP device info
-  int device = 0;
-  hipGetDevice(&device);
-  hipDeviceProp_t prop;
-  hipGetDeviceProperties(&prop, device);
-  printf("Device %d: %s  warpSize=%d\n\n", device, prop.name, prop.warpSize);
-
-#ifdef hipfftVersionMinor
-  printf("hipFFT version: %d.%d.%d\n\n",
-         hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
-#endif
-
-  // Original sweep with small howmany (these passed first time)
-  printf("=== Small howmany (original sweep) ===\n\n");
-  for (int G : {4, 8, 12, 16, 24, 32, 48, 64})
-    tryPlanAndExec(G, 512);
-
-  // Grid-realistic howmany values derived from actual lattice geometries.
-  // howmany = Ncomp * product(ldimensions[d] for d != dim)
-  // For LatticeComplexD: Ncomp=1.
-  printf("=== Grid-realistic parameters ===\n\n");
-
-  // --grid 16.16.16.16  4D FFT  (KNOWN TO FAIL in Grid)
-  // Each dim: G=16, Nperp=16^3=4096
-  tryPlanAndExec(16, 4096);
-
-  // --grid 32.32.32.32  4D FFT  (KNOWN TO SUCCEED in Grid)
-  // Each dim: G=32, Nperp=32^3=32768
-  tryPlanAndExec(32, 32768);
-
-  // --grid 32.32.32.32 Ls=8  5D DWF FFT  (KNOWN TO FAIL on dim 0 in Grid)
-  // dim 0: G=8,  Nperp=32^4=1048576
-  tryPlanAndExec(8, 1048576);
-  // dim 1-4: G=32, Nperp=8*32^3=262144
-  tryPlanAndExec(32, 262144);
-
-  // Extra intermediate cases to bracket the failure
-  tryPlanAndExec(16, 1024);
-  tryPlanAndExec(16, 2048);
-  tryPlanAndExec(16, 8192);
-  tryPlanAndExec(8,  4096);
-  tryPlanAndExec(8,  65536);
-  tryPlanAndExec(8,  262144);
-
-  return 0;
-}
@@ -1,168 +0,0 @@
-/*
- * Reproducer for HIPFFT_PARSE_ERROR (error 12) from hipfftMakePlanMany on
- * ROCm 7 / hipFFT 1.0.20 (Frontier, MI210 login and MI250X compute nodes).
- *
- * Observed failure: G < 32 returns HIPFFT_PARSE_ERROR from all three plan
- * creation APIs (hipfftPlanMany, hipfftMakePlanMany, hipfftPlan1d) when a
- * device buffer is allocated and zeroed with hipMalloc+hipMemset before the
- * plan creation call.  G >= 32 succeeds.
- *
- * Contrast with Test_hipfft_minimal.cc (plan-first ordering) which passes
- * for all G even with an empty rocFFT cache.
- *
- * Compile on Frontier (no Grid headers needed):
- *   hipcc -o Test_hipfft_repro Test_hipfft_repro.cc -lhipfft
- *
- * Run with empty cache to reproduce the failure:
- *   rm -rf ~/.cache/rocfft
- *   ./Test_hipfft_repro
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <hipfft/hipfft.h>
-#include <hip/hip_runtime.h>
-
-static const char *hipfftResultString(hipfftResult r) {
-  switch (r) {
-  case HIPFFT_SUCCESS:                  return "HIPFFT_SUCCESS";
-  case HIPFFT_INVALID_PLAN:             return "HIPFFT_INVALID_PLAN";
-  case HIPFFT_ALLOC_FAILED:             return "HIPFFT_ALLOC_FAILED";
-  case HIPFFT_INVALID_TYPE:             return "HIPFFT_INVALID_TYPE";
-  case HIPFFT_INVALID_VALUE:            return "HIPFFT_INVALID_VALUE";
-  case HIPFFT_INTERNAL_ERROR:           return "HIPFFT_INTERNAL_ERROR";
-  case HIPFFT_EXEC_FAILED:              return "HIPFFT_EXEC_FAILED";
-  case HIPFFT_SETUP_FAILED:             return "HIPFFT_SETUP_FAILED";
-  case HIPFFT_INVALID_SIZE:             return "HIPFFT_INVALID_SIZE";
-  case HIPFFT_UNALIGNED_DATA:           return "HIPFFT_UNALIGNED_DATA";
-  case HIPFFT_INCOMPLETE_PARAMETER_LIST:return "HIPFFT_INCOMPLETE_PARAMETER_LIST";
-  case HIPFFT_INVALID_DEVICE:           return "HIPFFT_INVALID_DEVICE";
-  case HIPFFT_PARSE_ERROR:              return "HIPFFT_PARSE_ERROR";
-  case HIPFFT_NO_WORKSPACE:             return "HIPFFT_NO_WORKSPACE";
-  case HIPFFT_NOT_IMPLEMENTED:          return "HIPFFT_NOT_IMPLEMENTED";
-  case HIPFFT_NOT_SUPPORTED:            return "HIPFFT_NOT_SUPPORTED";
-  default:                              return "UNKNOWN";
-  }
-}
-
-// Plan creation + execution for (G, howmany) using hipfftCreate+hipfftMakePlanMany.
-// This is the path Grid's FFT.h now uses.
-static void tryPlanAndExec(int G, long howmany) {
-  int n[] = {G};
-  long nelems = (long)G * howmany;
-
-  printf("--- G=%-4d  howmany=%-10ld  total_elems=%-12ld ---\n",
-         G, howmany, nelems);
-
-  // Allocate device buffer (hipfftDoubleComplex = 16 bytes each)
-  hipfftDoubleComplex *dbuf = nullptr;
-  hipError_t herr = hipMalloc(&dbuf, nelems * sizeof(hipfftDoubleComplex));
-  if (herr != hipSuccess) {
-    printf("  hipMalloc failed (%d) for %ld elems — skipping\n\n", (int)herr, nelems);
-    return;
-  }
-  hipMemset(dbuf, 0, nelems * sizeof(hipfftDoubleComplex));
-
-  // 1. hipfftPlanMany (one-step, nullptr embed) — current Grid path
-  {
-    hipfftHandle p;
-    hipfftResult rv = hipfftPlanMany(&p, 1, n,
-                                     nullptr, 1, G,
-                                     nullptr, 1, G,
-                                     HIPFFT_Z2Z, (int)howmany);
-    printf("  hipfftPlanMany   create : %d (%s)\n", (int)rv, hipfftResultString(rv));
-    if (rv == HIPFFT_SUCCESS) {
-      rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
-      hipDeviceSynchronize();
-      printf("  hipfftPlanMany   execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
-      hipfftDestroy(p);
-    }
-  }
-
-  // 2. hipfftCreate + hipfftMakePlanMany (two-step) — also current Grid path
-  {
-    hipfftHandle p;
-    size_t workSize = 0;
-    hipfftResult rc = hipfftCreate(&p);
-    if (rc == HIPFFT_SUCCESS) {
-      hipfftResult rv = hipfftMakePlanMany(p, 1, n,
-                                           nullptr, 1, G,
-                                           nullptr, 1, G,
-                                           HIPFFT_Z2Z, (int)howmany, &workSize);
-      printf("  hipfftMakePlanMany      : %d (%s)  workSize=%zu\n",
-             (int)rv, hipfftResultString(rv), workSize);
-      if (rv == HIPFFT_SUCCESS) {
-        rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
-        hipDeviceSynchronize();
-        printf("  hipfftMakePlanMany exec : %d (%s)\n", (int)rv, hipfftResultString(rv));
-      }
-      hipfftDestroy(p);
-    } else {
-      printf("  hipfftCreate            : %d (%s)\n", (int)rc, hipfftResultString(rc));
-    }
-  }
-
-  // 3. hipfftPlan1d (simplest API, batch = howmany)
-  {
-    hipfftHandle p;
-    hipfftResult rv = hipfftPlan1d(&p, G, HIPFFT_Z2Z, (int)howmany);
-    printf("  hipfftPlan1d     create : %d (%s)\n", (int)rv, hipfftResultString(rv));
-    if (rv == HIPFFT_SUCCESS) {
-      rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
-      hipDeviceSynchronize();
-      printf("  hipfftPlan1d     execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
-      hipfftDestroy(p);
-    }
-  }
-
-  hipFree(dbuf);
-  printf("\n");
-}
-
-int main(void) {
-  // Print HIP device info
-  int device = 0;
-  hipGetDevice(&device);
-  hipDeviceProp_t prop;
-  hipGetDeviceProperties(&prop, device);
-  printf("Device %d: %s  warpSize=%d\n\n", device, prop.name, prop.warpSize);
-
-#ifdef hipfftVersionMinor
-  printf("hipFFT version: %d.%d.%d\n\n",
-         hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
-#endif
-
-  // Original sweep with small howmany (these passed first time)
-  printf("=== Small howmany (original sweep) ===\n\n");
-  for (int G : {4, 8, 12, 16, 24, 32, 48, 64})
-    tryPlanAndExec(G, 512);
-
-  // Grid-realistic howmany values derived from actual lattice geometries.
-  // howmany = Ncomp * product(ldimensions[d] for d != dim)
-  // For LatticeComplexD: Ncomp=1.
-  printf("=== Grid-realistic parameters ===\n\n");
-
-  // --grid 16.16.16.16  4D FFT  (KNOWN TO FAIL in Grid)
-  // Each dim: G=16, Nperp=16^3=4096
-  tryPlanAndExec(16, 4096);
-
-  // --grid 32.32.32.32  4D FFT  (KNOWN TO SUCCEED in Grid)
-  // Each dim: G=32, Nperp=32^3=32768
-  tryPlanAndExec(32, 32768);
-
-  // --grid 32.32.32.32 Ls=8  5D DWF FFT  (KNOWN TO FAIL on dim 0 in Grid)
-  // dim 0: G=8,  Nperp=32^4=1048576
-  tryPlanAndExec(8, 1048576);
-  // dim 1-4: G=32, Nperp=8*32^3=262144
-  tryPlanAndExec(32, 262144);
-
-  // Extra intermediate cases to bracket the failure
-  tryPlanAndExec(16, 1024);
-  tryPlanAndExec(16, 2048);
-  tryPlanAndExec(16, 8192);
-  tryPlanAndExec(8,  4096);
-  tryPlanAndExec(8,  65536);
-  tryPlanAndExec(8,  262144);
-
-  return 0;
-}
@@ -0,0 +1,362 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: tests/debug/Test_staggered_hdcg.cc
+
+    Authors: Thomas Blum, Peter Boyle
+
+    HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
+    for naive staggered fermions, based on arXiv:2409.03904.
+
+    Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
+      - NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
+      - 4D SchurStaggeredOperator:  Mpc = m^2 - D_oe * D_eo  (hermitian, positive-definite)
+      - vColourVector fine field type (staggered has colour but no spin)
+      - NextToNearestStencilGeometry4D: 33-point coarse stencil
+
+    Stencil count: D_oe*D_eo has 2-hop fine range.  With blocking B >= 2 the coarse
+    shifts have L1-distance <= 2, giving 33 stencil points in 4D:
+      1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
+    NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
+    To extend to ImprovedStaggeredFermion later, use B >= 6.
+
+    Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
+
+    Usage (after build):
+      ./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
+
+*************************************************************************************/
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/AdefMrhs.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
+
+using namespace Grid;
+
+// Non-converging CG used as a smoother (fixed number of iterations)
+template<class Field>
+class CGSmoother : public LinearFunction<Field>
+{
+public:
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator &_Op;
+  int iters;
+  CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
+  void operator()(const Field &in, Field &out)
+  {
+    ConjugateGradient<Field> CG(0.0, iters, false);
+    out = Zero();
+    CG(_Op, in, out);
+  }
+};
+
+int main(int argc, char **argv)
+{
+  fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
+  Grid_init(&argc, &argv);
+  fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
+
+  //--------------------------------------------------------------------
+  // Parameters — tune for production
+  //--------------------------------------------------------------------
+  const int nbasis = 24;   // near-null space dimension
+  const int cb     = 0;    // even checkerboard
+
+  RealD mass = 0.05;
+
+  // NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
+  // c1 = coefficient of the hopping term (1.0 = standard normalisation).
+  // u0 = tadpole factor (1.0 = no tadpole improvement).
+  RealD c1 = 1.0;
+  RealD u0 = 1.0;
+
+  //--------------------------------------------------------------------
+  // Grids
+  // Fine:   UGrid (4D full), UrbGrid (4D red-black)
+  // Coarse: Coarse4d  with dimensions = GridDefaultLatt() / Block
+  //
+  // Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
+  // NaiveStaggeredFermion works with any Block >= {2,2,2,2}
+  //--------------------------------------------------------------------
+  fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
+  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
+      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
+  fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
+  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  Coordinate Block({4, 4, 4, 4});
+  Coordinate clatt = GridDefaultLatt();
+  for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
+  Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
+  Coordinate cmpi  = GridDefaultMpi();
+  fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
+          clatt[0],clatt[1],clatt[2],clatt[3],
+          csimd[0],csimd[1],csimd[2],csimd[3],
+          cmpi[0],cmpi[1],cmpi[2],cmpi[3],
+          (int)vComplex::Nsimd()); fflush(stderr);
+
+  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
+  fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
+
+  //--------------------------------------------------------------------
+  // RNG + gauge field
+  //--------------------------------------------------------------------
+  fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
+  GridParallelRNG RNG4(UGrid);    RNG4.SeedFixedIntegers({1,2,3,4});
+  fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
+  GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
+  fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
+  LatticeGaugeField Umu(UGrid);
+  fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
+  SU<Nc>::HotConfiguration(RNG4, Umu);
+  fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
+  NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
+  fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
+  SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
+  fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
+
+  //--------------------------------------------------------------------
+  // Subspace: inverse-iteration near-null vectors
+  //
+  // CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
+  // converging naturally to the low modes of HermOp without needing spectral
+  // bound tuning.  Switch to CreateSubspaceChebyshevNew once the spectrum is
+  // well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
+  //--------------------------------------------------------------------
+  typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
+  Subspace Aggregates(Coarse4d, UrbGrid, cb);
+
+  Aggregates.CreateSubspace(RNGrb, HermOp);
+  Aggregates.Orthogonalise();
+
+  //--------------------------------------------------------------------
+  // Coarse geometry: NextToNearestStencilGeometry4D
+  //   hops=2  ->  33 stencil points in 4D
+  //--------------------------------------------------------------------
+  NextToNearestStencilGeometry4D geom(Coarse4d);
+
+  std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
+
+  //--------------------------------------------------------------------
+  // Single-RHS coarse operator (used for correctness check below)
+  //--------------------------------------------------------------------
+  typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
+  typedef LittleDiracOp::CoarseVector CoarseVector;
+
+  LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
+  LDO.CoarsenOperator(HermOp, Aggregates);
+
+  //--------------------------------------------------------------------
+  // Correctness check: P M_fine P^T c  ≈  M_coarse c
+  //
+  // Promote a random coarse vector into the fine subspace, apply the
+  // fine operator, project back, and compare with the coarse operator
+  // applied directly.  Error should be at the level of subspace
+  // approximation quality (smaller = better basis vectors).
+  //--------------------------------------------------------------------
+  {
+    GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
+    CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
+    random(RNGc, c_src);
+
+    LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
+    Aggregates.PromoteFromSubspace(c_src, f_v);
+    HermOp.Op(f_v, f_Mv);
+    Aggregates.ProjectToSubspace(c_proj, f_Mv);
+
+    LDO.M(c_src, c_ldop);
+
+    c_proj -= c_ldop;
+    RealD err = norm2(c_proj) / norm2(c_ldop);
+    std::cout << GridLogMessage
+              << "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
+  }
+
+  //--------------------------------------------------------------------
+  // Multi-RHS coarse grid
+  //
+  // The extra leading dimension holds nrhs right-hand sides packed into
+  // SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
+  //--------------------------------------------------------------------
+  const int nrhs = vComplex::Nsimd() * 2;
+
+  Coordinate mpi   = GridDefaultMpi();
+  Coordinate rhMpi ({1,    mpi[0],  mpi[1],  mpi[2],  mpi[3]});
+  Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
+  Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
+
+  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
+
+  typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
+  MultiCoarseOp mrhs(geom, CoarseMrhs);
+  mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
+
+  //--------------------------------------------------------------------
+  // Coarse-grid Lanczos for deflation
+  //--------------------------------------------------------------------
+  typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
+  MrhsHermOp MrhsCoarseOp(mrhs);
+
+  // Estimate spectral bounds for Lanczos Chebyshev filter
+  CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
+  PowerMethod<CoarseVector> cPM;
+  RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
+  // Chebyshev filter window [lo, hi]:
+  //   lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
+  //   coarse eigenvalues so that only the target modes receive cosh amplification.
+  //
+  // From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
+  //   Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746]  ~= mass^2
+  //   Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
+  //   Group 2 (second group): lambda in [0.165, 0.179]
+  //
+  // lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
+  // factor 8x below lambda_24=0.165).
+  //   hi = lambda_max_coarse * 1.1 ~= 2.121
+  //   y(lambda_0=0.002647)  ~ -1.016 -> T_70 ~ 1.7e5  (cosh(70*0.182))
+  //   y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
+  //   Relative spread across near-null cluster: ~4.3%
+  //   y(lambda_24=0.165)    ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
+  //
+  // order=71 (degree 70) is needed to give ~4% relative spread across the
+  // near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
+  // gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
+  // Absolute amplification ~1e5; what matters for IRL convergence is the
+  // relative spread, not the absolute value.
+  // lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
+  // lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
+  RealD lambda_lo  = 0.02;
+
+  std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
+            << " hi=" << lambda_max*1.1 << " order=71" << std::endl;
+
+  Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
+
+  // 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
+  // in the first Lanczos restart.  The remaining modes (~0.165) are a
+  // second spectral group that needs more Krylov vectors; handle them
+  // separately once the basic HDCG solve is validated.
+  int Nk    = 24;
+  int Nm    = 48;
+  int Nstop = Nk;
+
+  GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
+
+  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
+    IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
+        Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
+
+  int Nconv;
+  std::vector<RealD>        eval(Nm);
+  std::vector<CoarseVector> evec(Nm,   Coarse4d);  // evec on f_grid (single-RHS coarse)
+  std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
+  for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
+  IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
+
+  //--------------------------------------------------------------------
+  // HDCG solver assembly
+  //--------------------------------------------------------------------
+  MultiRHSDeflation<CoarseVector> MrhsGuesser;
+  MrhsGuesser.ImportEigenBasis(evec, eval);
+
+  // MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
+  MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
+  MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
+  MrhsProjector.ImportBasis(Aggregates.subspace);
+
+  ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
+  DoNothingGuesser<CoarseVector>  DoNothing;
+  HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
+
+  // Spectral radius of the fine operator, needed for the smoother shift.
+  // Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
+  LatticeStaggeredFermionD fine_pm_src(UrbGrid);
+  random(RNGrb, fine_pm_src);
+  PowerMethod<LatticeStaggeredFermionD> finePM;
+  RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
+
+  // Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 100.
+  //
+  // The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the
+  // interval [shift, lambda_max + shift] ~ [0.046, 4.65], so the polynomial
+  // focuses entirely on the HIGH-frequency part of the spectrum and leaves
+  // near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
+  //
+  // This is the right target because the coarse-grid correction always introduces
+  // high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
+  // are piecewise constant across coarse cells and therefore have sharp edges at
+  // cell boundaries (like lego-block edges). Smoothness is measured by the
+  // covariant Dirac derivative, so promoting the coarse solution back to the
+  // fine grid inevitably excites high-frequency components — just as a step
+  // function always carries high-frequency Fourier content.  The smoother must
+  // repair exactly these high modes.
+  //
+  // The smoother and the coarse-grid correction are applied alternately: together
+  // they both lift the low eigenvalues and pull down the upper eigenvalues of the
+  // composite preconditioned operator, reducing the condition number seen by the
+  // outer HDCG iterations.
+  //
+  // DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
+  // the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
+  RealD smootherShift = fine_lambda_max / 200.0;
+  std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
+            << fine_lambda_max << "/200 = " << smootherShift << std::endl;
+  ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
+  CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
+
+  TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
+    HDCG(1.0e-8, 500,
+         HermOp,
+         smoother,
+         HPDSolve,   // M1 (coarse correction)
+         HPDSolve,   // Vstart (initial guess projection)
+         MrhsProjector,
+         MrhsGuesser,
+         CoarseMrhs);
+
+  //--------------------------------------------------------------------
+  // Solve: nrhs right-hand sides simultaneously
+  //--------------------------------------------------------------------
+  std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
+  std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
+
+  GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
+  for (int r = 0; r < nrhs; r++) {
+    random(RNGrb2, src[r]);
+    sol[r] = Zero();
+  }
+
+  //--------------------------------------------------------------------
+  // Baseline: standard single-RHS CG on HermOp (no preconditioning)
+  // Run before HDCG to establish the unpreconditioned iteration count
+  // and wall-clock time for direct comparison.
+  //--------------------------------------------------------------------
+  {
+    ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 50000, false);
+    std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
+    for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
+
+    RealD t0 = usecond();
+    int total_iters = 0;
+    for (int r = 0; r < nrhs; r++) {
+      std::cout << GridLogMessage << "====== CG baseline RHS " << r
+                << " ======" << std::endl;
+      CG(HermOp, src[r], cg_sol[r]);
+      total_iters += CG.IterationsToComplete;
+    }
+    RealD t1 = usecond();
+    std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
+              << total_iters << " total iterations, "
+              << (t1 - t0) / 1.0e6 << " s total, "
+              << (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
+  }
+
+  //--------------------------------------------------------------------
+  // HDCG solve
+  //--------------------------------------------------------------------
+  HDCG(src, sol);
+
+  Grid_finalize();
+  return 0;
+}
@@ -0,0 +1,373 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: tests/debug/Test_staggered_hdcg.cc
+
+    Authors: Thomas Blum, Peter Boyle
+
+    HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
+    for naive staggered fermions, based on arXiv:2409.03904.
+
+    Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
+      - NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
+      - 4D SchurStaggeredOperator:  Mpc = m^2 - D_oe * D_eo  (hermitian, positive-definite)
+      - vColourVector fine field type (staggered has colour but no spin)
+      - NextToNearestStencilGeometry4D: 33-point coarse stencil
+
+    Stencil count: D_oe*D_eo has 2-hop fine range.  With blocking B >= 2 the coarse
+    shifts have L1-distance <= 2, giving 33 stencil points in 4D:
+      1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
+    NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
+    To extend to ImprovedStaggeredFermion later, use B >= 6.
+
+    Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
+
+    Usage (after build):
+      ./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
+
+*************************************************************************************/
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/AdefMrhs.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
+
+using namespace Grid;
+
+// Non-converging CG used as a smoother (fixed number of iterations)
+template<class Field>
+class CGSmoother : public LinearFunction<Field>
+{
+public:
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator &_Op;
+  int iters;
+  CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
+  void operator()(const Field &in, Field &out)
+  {
+    ConjugateGradient<Field> CG(0.0, iters, false);
+    out = Zero();
+    CG(_Op, in, out);
+  }
+};
+
+int main(int argc, char **argv)
+{
+  fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
+  Grid_init(&argc, &argv);
+  fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
+
+  //--------------------------------------------------------------------
+  // Parameters — tune for production
+  //--------------------------------------------------------------------
+  const int nbasis = 24;   // near-null space dimension
+  const int cb     = 0;    // even checkerboard
+
+  RealD mass = 0.00184;
+
+  // NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
+  // c1 = coefficient of the hopping term (1.0 = standard normalisation).
+  // u0 = tadpole factor (1.0 = no tadpole improvement).
+  RealD c1 = 1.0;
+  RealD u0 = 1.0;
+
+  //--------------------------------------------------------------------
+  // Grids
+  // Fine:   UGrid (4D full), UrbGrid (4D red-black)
+  // Coarse: Coarse4d  with dimensions = GridDefaultLatt() / Block
+  //
+  // Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
+  // NaiveStaggeredFermion works with any Block >= {2,2,2,2}
+  //--------------------------------------------------------------------
+  fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
+  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
+      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
+  fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
+  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  Coordinate Block({4, 4, 4, 4});
+  Coordinate clatt = GridDefaultLatt();
+  for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
+  Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
+  Coordinate cmpi  = GridDefaultMpi();
+  fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
+          clatt[0],clatt[1],clatt[2],clatt[3],
+          csimd[0],csimd[1],csimd[2],csimd[3],
+          cmpi[0],cmpi[1],cmpi[2],cmpi[3],
+          (int)vComplex::Nsimd()); fflush(stderr);
+
+  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
+  fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
+
+  //--------------------------------------------------------------------
+  // RNG + gauge field
+  //--------------------------------------------------------------------
+  fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
+  GridParallelRNG RNG4(UGrid);    RNG4.SeedFixedIntegers({1,2,3,4});
+  fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
+  GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
+  fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
+  LatticeGaugeField Umu(UGrid);
+  int HotStart = 0;
+  if ( HotStart ) {
+    fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
+    SU<Nc>::HotConfiguration(RNG4, Umu);
+  } else { 
+    FieldMetaData header;
+    std::string file("./configuration.ildg");
+    IldgReader IR;
+    IR.open(file);
+    IR.readConfiguration(Umu,header);
+    IR.close();
+  }
+  
+  fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
+  NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
+  fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
+  SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
+  fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
+
+  //--------------------------------------------------------------------
+  // Subspace: inverse-iteration near-null vectors
+  //
+  // CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
+  // converging naturally to the low modes of HermOp without needing spectral
+  // bound tuning.  Switch to CreateSubspaceChebyshevNew once the spectrum is
+  // well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
+  //--------------------------------------------------------------------
+  typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
+  Subspace Aggregates(Coarse4d, UrbGrid, cb);
+
+  Aggregates.CreateSubspace(RNGrb, HermOp);
+  Aggregates.Orthogonalise();
+
+  //--------------------------------------------------------------------
+  // Coarse geometry: NextToNearestStencilGeometry4D
+  //   hops=2  ->  33 stencil points in 4D
+  //--------------------------------------------------------------------
+  NextToNearestStencilGeometry4D geom(Coarse4d);
+
+  std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
+
+  //--------------------------------------------------------------------
+  // Single-RHS coarse operator (used for correctness check below)
+  //--------------------------------------------------------------------
+  typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
+  typedef LittleDiracOp::CoarseVector CoarseVector;
+
+  LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
+  LDO.CoarsenOperator(HermOp, Aggregates);
+
+  //--------------------------------------------------------------------
+  // Correctness check: P M_fine P^T c  ≈  M_coarse c
+  //
+  // Promote a random coarse vector into the fine subspace, apply the
+  // fine operator, project back, and compare with the coarse operator
+  // applied directly.  Error should be at the level of subspace
+  // approximation quality (smaller = better basis vectors).
+  //--------------------------------------------------------------------
+  {
+    GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
+    CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
+    random(RNGc, c_src);
+
+    LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
+    Aggregates.PromoteFromSubspace(c_src, f_v);
+    HermOp.Op(f_v, f_Mv);
+    Aggregates.ProjectToSubspace(c_proj, f_Mv);
+
+    LDO.M(c_src, c_ldop);
+
+    c_proj -= c_ldop;
+    RealD err = norm2(c_proj) / norm2(c_ldop);
+    std::cout << GridLogMessage
+              << "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
+  }
+
+  //--------------------------------------------------------------------
+  // Multi-RHS coarse grid
+  //
+  // The extra leading dimension holds nrhs right-hand sides packed into
+  // SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
+  //--------------------------------------------------------------------
+  const int nrhs = vComplex::Nsimd() * 2;
+
+  Coordinate mpi   = GridDefaultMpi();
+  Coordinate rhMpi ({1,    mpi[0],  mpi[1],  mpi[2],  mpi[3]});
+  Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
+  Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
+
+  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
+
+  typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
+  MultiCoarseOp mrhs(geom, CoarseMrhs);
+  mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
+
+  //--------------------------------------------------------------------
+  // Coarse-grid Lanczos for deflation
+  //--------------------------------------------------------------------
+  typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
+  MrhsHermOp MrhsCoarseOp(mrhs);
+
+  // Estimate spectral bounds for Lanczos Chebyshev filter
+  CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
+  PowerMethod<CoarseVector> cPM;
+  RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
+  // Chebyshev filter window [lo, hi]:
+  //   lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
+  //   coarse eigenvalues so that only the target modes receive cosh amplification.
+  //
+  // From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
+  //   Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746]  ~= mass^2
+  //   Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
+  //   Group 2 (second group): lambda in [0.165, 0.179]
+  //
+  // lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
+  // factor 8x below lambda_24=0.165).
+  //   hi = lambda_max_coarse * 1.1 ~= 2.121
+  //   y(lambda_0=0.002647)  ~ -1.016 -> T_70 ~ 1.7e5  (cosh(70*0.182))
+  //   y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
+  //   Relative spread across near-null cluster: ~4.3%
+  //   y(lambda_24=0.165)    ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
+  //
+  // order=71 (degree 70) is needed to give ~4% relative spread across the
+  // near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
+  // gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
+  // Absolute amplification ~1e5; what matters for IRL convergence is the
+  // relative spread, not the absolute value.
+  // lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
+  // lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
+  RealD lambda_lo  = 0.02;
+
+  std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
+            << " hi=" << lambda_max*1.1 << " order=71" << std::endl;
+
+  Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
+
+  // 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
+  // in the first Lanczos restart.  The remaining modes (~0.165) are a
+  // second spectral group that needs more Krylov vectors; handle them
+  // separately once the basic HDCG solve is validated.
+  int Nk    = 24;
+  int Nm    = 48;
+  int Nstop = Nk;
+
+  GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
+
+  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
+    IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
+        Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
+
+  int Nconv;
+  std::vector<RealD>        eval(Nm);
+  std::vector<CoarseVector> evec(Nm,   Coarse4d);  // evec on f_grid (single-RHS coarse)
+  std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
+  for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
+  IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
+
+  //--------------------------------------------------------------------
+  // HDCG solver assembly
+  //--------------------------------------------------------------------
+  MultiRHSDeflation<CoarseVector> MrhsGuesser;
+  MrhsGuesser.ImportEigenBasis(evec, eval);
+
+  // MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
+  MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
+  MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
+  MrhsProjector.ImportBasis(Aggregates.subspace);
+
+  ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
+  DoNothingGuesser<CoarseVector>  DoNothing;
+  HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
+
+  // Spectral radius of the fine operator, needed for the smoother shift.
+  // Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
+  LatticeStaggeredFermionD fine_pm_src(UrbGrid);
+  random(RNGrb, fine_pm_src);
+  PowerMethod<LatticeStaggeredFermionD> finePM;
+  RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
+
+  // Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 100.
+  //
+  // The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the
+  // interval [shift, lambda_max + shift] ~ [0.046, 4.65], so the polynomial
+  // focuses entirely on the HIGH-frequency part of the spectrum and leaves
+  // near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
+  //
+  // This is the right target because the coarse-grid correction always introduces
+  // high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
+  // are piecewise constant across coarse cells and therefore have sharp edges at
+  // cell boundaries (like lego-block edges). Smoothness is measured by the
+  // covariant Dirac derivative, so promoting the coarse solution back to the
+  // fine grid inevitably excites high-frequency components — just as a step
+  // function always carries high-frequency Fourier content.  The smoother must
+  // repair exactly these high modes.
+  //
+  // The smoother and the coarse-grid correction are applied alternately: together
+  // they both lift the low eigenvalues and pull down the upper eigenvalues of the
+  // composite preconditioned operator, reducing the condition number seen by the
+  // outer HDCG iterations.
+  //
+  // DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
+  // the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
+  RealD smootherShift = fine_lambda_max / 200.0;
+  std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
+            << fine_lambda_max << "/200 = " << smootherShift << std::endl;
+  ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
+  CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
+
+  TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
+    HDCG(1.0e-8, 500,
+         HermOp,
+         smoother,
+         HPDSolve,   // M1 (coarse correction)
+         HPDSolve,   // Vstart (initial guess projection)
+         MrhsProjector,
+         MrhsGuesser,
+         CoarseMrhs);
+
+  //--------------------------------------------------------------------
+  // Solve: nrhs right-hand sides simultaneously
+  //--------------------------------------------------------------------
+  std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
+  std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
+
+  GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
+  for (int r = 0; r < nrhs; r++) {
+    random(RNGrb2, src[r]);
+    sol[r] = Zero();
+  }
+
+  //--------------------------------------------------------------------
+  // Baseline: standard single-RHS CG on HermOp (no preconditioning)
+  // Run before HDCG to establish the unpreconditioned iteration count
+  // and wall-clock time for direct comparison.
+  //--------------------------------------------------------------------
+  {
+    ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 100000, false);
+    std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
+    for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
+
+    RealD t0 = usecond();
+    int total_iters = 0;
+    for (int r = 0; r < nrhs; r++) {
+      std::cout << GridLogMessage << "====== CG baseline RHS " << r
+                << " ======" << std::endl;
+      CG(HermOp, src[r], cg_sol[r]);
+      total_iters += CG.IterationsToComplete;
+    }
+    RealD t1 = usecond();
+    std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
+              << total_iters << " total iterations, "
+              << (t1 - t0) / 1.0e6 << " s total, "
+              << (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
+  }
+
+  //--------------------------------------------------------------------
+  // HDCG solve
+  //--------------------------------------------------------------------
+  HDCG(src, sol);
+
+  Grid_finalize();
+  return 0;
+}