Some improvements that should have been there if in synch with develop,

and also some staggered hdcg type work
2026-07-22 03:23:28 +01:00 · 2026-05-29 13:36:57 -04:00
parent 34d8d003a8
commit 42cd9eda71
8 changed files with 660 additions and 93 deletions
@@ -59,7 +59,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
 #if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
  #define GRID_CUB_SUM_OP ::cuda::std::plus<>{}
 #else
-  #define GRID_CUB_SUM_OP ::cub::Sum()
+  #define GRID_CUB_SUM_OP ::gpucub::Sum()
 #endif
  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
@@ -33,7 +33,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 ;
 int main (int argc, char ** argv)
 {
@@ -97,20 +96,38 @@ int main (int argc, char ** argv)
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
  NaiveStaggeredFermionD Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
  std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
-  int ncall=1000;
+  int ncall=100;
  // warm perf only
  for(int i=0;i<ncall;i++){
    Ds.Dhop(src,result,0);
  }
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Ds.Dhop(src,result,0);
  }
  double t1=usecond();
-  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146
  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  // Naive staggered operator (Dn): untimed warm-up pass first
  for(int i=0;i<ncall;i++){
    Dn.Dhop(src,result,0);
  }
  // Timed pass. This must call Dn (the operator being reported below);
  // timing Ds here would attribute the improved operator's cost to "Dn".
  t0=usecond();
  for(int i=0;i<ncall;i++){
    Dn.Dhop(src,result,0);
  }
  t1=usecond();
  flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall; // == 66*8 + 42 == 570 per site
  std::cout<<GridLogMessage << "Called Dn"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
  Grid_finalize();
 }
@@ -716,6 +716,161 @@ public:
    return mflops_best;
  }
  static double NaiveStaggered(int L)
  {
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
    Coordinate local({L,L,L,L});
    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),
 								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark NaiveStaggered on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    RealD mass=0.1;
    RealD c1=9.0/8.0;
    RealD c2=-1.0/24.0;
    RealD u0=1.0;
    typedef NaiveStaggeredFermionF Action;
    typedef typename Action::FermionField Fermion; 
    typedef LatticeGaugeFieldF Gauge;
    Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
    typename Action::ImplParams params;
    Action Ds(Umu,*FGrid,*FrbGrid,mass,c1,u0,params);
    ///////// Source preparation ////////////
    Fermion src   (FGrid); random(RNG4,src);
    Fermion src_e (FrbGrid);
    Fermion src_o (FrbGrid);
    Fermion r_e   (FrbGrid);
    Fermion r_o   (FrbGrid);
    Fermion r_eo  (FGrid);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
      const int num_cases = 2;
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
      controls Cases [] = {
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
      }; 
      for(int c=0;c<num_cases;c++) {
 	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
 	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 	  Ds.DhopEO(src_o,r_e,DaggerNo);
 	}
 	FGrid->Barrier();
 	double t1=usecond();
 	uint64_t no    = 50;
 	uint64_t ni    = 100;
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	time_statistics timestat;
 	std::vector<double> t_time(no);
 	for(uint64_t i=0;i<no;i++){
 	  t0=usecond();
 	  for(uint64_t j=0;j<ni;j++){
 	    Ds.DhopEO(src_o,r_e,DaggerNo);
 	  }
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
 	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 	double flops=((8*(3*(6+8+8)) + 7*3*2)*1.0*volume)/2;
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
 	mf_hi = flops/timestat.min*ni;
 	mf_lo = flops/timestat.max*ni;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean*ni;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call   "<< timestat.mean/ni<<std::endl;
      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    return mflops_best;
  }
  static double Clover(int L)
  {
    double mflops;
@@ -887,6 +1042,7 @@ int main (int argc, char ** argv)
  std::vector<double> clover;
  std::vector<double> dwf4;
  std::vector<double> staggered;
  std::vector<double> naive_staggered;
  int Ls=1;
  if (do_dslash){
@@ -914,13 +1070,21 @@ int main (int argc, char ** argv)
    staggered.push_back(result);
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Naive Staggered dslash 4D vectorised" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
    double result = Benchmark::NaiveStaggered(L_list[l]) ;
    naive_staggered.push_back(result);
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered \t\t Naive Staggered" <<std::endl;
  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<" \t\t "<<naive_staggered[l]<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }
@@ -930,14 +1094,14 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
+    std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered \t\t NaiveStag \t|\t (GF/s per node)" <<std::endl;
    fprintf(FP,"Per node summary table\n");
    fprintf(FP,"\n");
-    fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
+    fprintf(FP,"L , Wilson, DWF4, Staggered, NaiveStag\n");
    fprintf(FP,"\n");
    for(int l=0;l<L_list.size();l++){
-      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<" \t " <<naive_staggered[l]/NN<<std::endl;
-      fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
+      fprintf(FP,"%d , %.0f, %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.,naive_staggered[l]/NN/1000.);
    }
    fprintf(FP,"\n");
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -1,76 +1,91 @@
 Per node summary table
 L , Wilson, DWF4, Staggered, NaiveStag
 8 , 90, 933, 38, 23
 12 , 403, 1688, 178, 113
 16 , 188, 1647, 449, 295
 24 , 947, 1574, 674, 553
 32 , 931, 1371, 718, 643
 Memory Bandwidth
 Bytes, GB/s per node
-6291456, 379.297050
+786432, 40.271620
-100663296, 3754.674992
+12582912, 433.611792
-509607936, 6521.472413
+63700992, 905.374321
-1610612736, 8513.456479
+201326592, 1114.979152
-3932160000, 9018.901766
+491520000, 1180.241898
 GEMM
 M, N, K, BATCH, GF/s per rank
 16, 8, 16, 256, 0.564958
 16, 16, 16, 256, 243.148058
 16, 32, 16, 256, 440.346877
 32, 8, 32, 256, 439.194136
 32, 16, 32, 256, 847.334141
 32, 32, 32, 256, 1430.892623
 64, 8, 64, 256, 1242.756741
 64, 16, 64, 256, 2196.689493
 64, 32, 64, 256, 3697.458072
 16, 8, 256, 256, 899.582627
 16, 16, 256, 256, 1673.537756
 16, 32, 256, 256, 2959.597089
 32, 8, 256, 256, 1558.858630
 32, 16, 256, 256, 2864.839445
 32, 32, 256, 256, 4810.671254
 64, 8, 256, 256, 2386.092942
 64, 16, 256, 256, 4451.665937
 64, 32, 256, 256, 5942.124095
 8, 256, 16, 256, 799.867271
 16, 256, 16, 256, 1584.624888
 32, 256, 16, 256, 1949.422338
 8, 256, 32, 256, 1389.417474
 16, 256, 32, 256, 2668.344493
 32, 256, 32, 256, 3234.162120
 8, 256, 64, 256, 2150.925128
 16, 256, 64, 256, 4012.488132
 32, 256, 64, 256, 5154.785521
 Communications
 Packet bytes, direction, GB/s per node
 4718592, 1, 245.026198
 4718592, 2, 251.180996
 4718592, 3, 361.110977
 4718592, 5, 247.898447
 4718592, 6, 249.867523
 4718592, 7, 359.033061
 15925248, 1, 255.030946
 15925248, 2, 264.453890
 15925248, 3, 392.949183
 15925248, 5, 256.040644
 15925248, 6, 264.681896
 15925248, 7, 392.102622
 37748736, 1, 258.823333
 37748736, 2, 268.181577
 37748736, 3, 401.478191
 37748736, 5, 258.995363
 37748736, 6, 268.206586
 37748736, 7, 400.397611
-Per node summary table
+GEMM
 M, N, K, BATCH, GF/s per rank fp64
 16, 8, 16, 4096, 693.316363
 16, 12, 16, 4096, 657.277058
 16, 16, 16, 4096, 711.992616
 32, 8, 32, 4096, 821.084324
 32, 12, 32, 4096, 1279.852719
 32, 16, 32, 4096, 2647.096674
 64, 8, 64, 4096, 2630.192325
 64, 12, 64, 4096, 3338.071321
 64, 16, 64, 4096, 3950.899281
 16, 8, 256, 4096, 1638.362501
 16, 12, 256, 4096, 2377.502234
 16, 16, 256, 4096, 3048.328833
 32, 8, 256, 4096, 2917.384276
 32, 12, 256, 4096, 4103.085151
 32, 16, 256, 4096, 5102.971860
 64, 8, 256, 4096, 3222.258206
 64, 12, 256, 4096, 4619.456391
 64, 16, 256, 4096, 5847.916650
 8, 256, 16, 4096, 1728.073337
 12, 256, 16, 4096, 2356.653970
 16, 256, 16, 4096, 2676.876038
 8, 256, 32, 4096, 2611.531990
 12, 256, 32, 4096, 3451.573106
 16, 256, 32, 4096, 3966.915301
 8, 256, 64, 4096, 3436.248737
 12, 256, 64, 4096, 4539.497945
 16, 256, 64, 4096, 5307.992323
 GEMM
 M, N, K, BATCH, GF/s per rank fp32
 16, 8, 16, 4096, 499.017445
 16, 12, 16, 4096, 731.543385
 16, 16, 16, 4096, 958.800786
 32, 8, 32, 4096, 1549.813550
 32, 12, 32, 4096, 2147.907502
 32, 16, 32, 4096, 2601.698596
 64, 8, 64, 4096, 3785.446233
 64, 12, 64, 4096, 5116.694843
 64, 16, 64, 4096, 6109.345016
 16, 8, 256, 4096, 1206.627737
 16, 12, 256, 4096, 1809.699599
 16, 16, 256, 4096, 2412.014053
 32, 8, 256, 4096, 2406.114488
 32, 12, 256, 4096, 3605.531907
 32, 16, 256, 4096, 4798.444037
 64, 8, 256, 4096, 4688.711196
 64, 12, 256, 4096, 6990.696301
 64, 16, 256, 4096, 9214.749925
 8, 256, 16, 4096, 2596.307289
 12, 256, 16, 4096, 3439.892562
 16, 256, 16, 4096, 3907.201036
 8, 256, 32, 4096, 3012.752067
 12, 256, 32, 4096, 3904.217583
 16, 256, 32, 4096, 4599.047092
 8, 256, 64, 4096, 3721.999042
 12, 256, 64, 4096, 5098.573927
 16, 256, 64, 4096, 6159.080872
 L , Wilson, DWF4, Staggered, GF/s per node
 8 , 155, 1386, 50
 12 , 694, 4208, 230
 16 , 1841, 6675, 609
 24 , 3934, 8573, 1641
 32 , 5083, 9771, 3086
@@ -1,4 +1,3 @@
 CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 ../../configure --enable-comms=mpi-auto \
 --with-lime=$CLIME \
 --enable-unified=no \
@@ -9,12 +8,13 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
+--with-gmp=$GMP \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
+--with-mpfr=$MPFR \
 --with-openssl=$OPENSSL \
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
-CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
+CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include " \
- LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft"
+ LDFLAGS="-L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft -lamdhip64"
@@ -1,16 +1,14 @@
 echo spack
-. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
+. /autofs/nccs-svm1_home1/paboyle/spack/share/spack/setup-env.sh
-module load amd/7.0.2
+export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
-module load cray-fftw
+export MPFR=`spack find --paths mpfr    | grep ^mpfr  | awk '{print $2}' `
-module load craype-accel-amd-gfx90a
+export OPENSSL=`spack find --paths openssl | grep openssl | awk  '{print $2}' `
-mkdir $HOME/LD_PATH
+export GMP=`spack find --paths gmp      | grep ^gmp | awk '{print $2}' `
 ln -s /opt/rocm-6.4.2/lib/libamdhip* $HOME/LD_PATH
-#Ugly hacks to get down level software working on current system
+module load cce/21.0.0
-export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
+module load cpe/26.03
-export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
+module load rocm/7.0.2
-#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/LD_PATH/
+export LD_LIBRARY_PATH=/opt/rocm-7.0.2/lib/llvm/lib/:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-7.0.2/lib
@@ -36,8 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
-gridblasHandle_t GridBLAS::gridblasHandle;
+//gridblasHandle_t GridBLAS::gridblasHandle;
-int            GridBLAS::gridblasInit;
+//int            GridBLAS::gridblasInit;
 ///////////////////////
 // Tells little dirac op to use MdagM as the .Op()
@@ -0,0 +1,373 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: tests/debug/Test_staggered_hdcg.cc
    Authors: Thomas Blum, Peter Boyle
    HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
    for naive staggered fermions, based on arXiv:2409.03904.
    Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
      - NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
      - 4D SchurStaggeredOperator:  Mpc = m^2 - D_oe * D_eo  (hermitian, positive-definite)
      - vColourVector fine field type (staggered has colour but no spin)
      - NextToNearestStencilGeometry4D: 33-point coarse stencil
    Stencil count: D_oe*D_eo has 2-hop fine range.  With blocking B >= 2 the coarse
    shifts have L1-distance <= 2, giving 33 stencil points in 4D:
      1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
    NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
    To extend to ImprovedStaggeredFermion later, use B >= 6.
    Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
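    Usage (after build):
      ./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
    With HotStart = 0 (the setting in main) the gauge field is read from
    ./configuration.ildg in the working directory; set HotStart = 1 to use a
    random hot configuration instead.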
 *************************************************************************************/
 #include <Grid/Grid.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
 using namespace Grid;
 // Smoother built from a conjugate-gradient solve that is stopped after a
 // fixed iteration budget: the tolerance handed to CG is 0.0, so the iteration
 // count is what bounds the work, and the CG is constructed with its third
 // argument set to false.
 //   _Op   : operator handed to the CG solve
 //   iters : iteration limit handed to the CG solve
 template<class Field>
 class CGSmoother : public LinearFunction<Field>
 {
 public:
  using FineOperator = LinearOperatorBase<Field>;
  FineOperator &_Op;
  int iters;
  CGSmoother(int _iters, FineOperator &Op)
    : _Op(Op),
      iters(_iters)
  {}
  // Apply the smoother: start from a zero guess and run the capped CG on `in`,
  // leaving the result in `out`.
  void operator()(const Field &in, Field &out)
  {
    out = Zero();
    ConjugateGradient<Field> cappedCG(0.0, iters, false);
    cappedCG(_Op, in, out);
  }
 };
 int main(int argc, char **argv)
 {
  fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
  Grid_init(&argc, &argv);
  fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
  //--------------------------------------------------------------------
  // Parameters — tune for production
  //--------------------------------------------------------------------
  const int nbasis = 24;   // near-null space dimension
  const int cb     = 0;    // even checkerboard
  RealD mass = 0.00184;
  // NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
  // c1 = coefficient of the hopping term (1.0 = standard normalisation).
  // u0 = tadpole factor (1.0 = no tadpole improvement).
  RealD c1 = 1.0;
  RealD u0 = 1.0;
  //--------------------------------------------------------------------
  // Grids
  // Fine:   UGrid (4D full), UrbGrid (4D red-black)
  // Coarse: Coarse4d  with dimensions = GridDefaultLatt() / Block
  //
  // Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
  // NaiveStaggeredFermion works with any Block >= {2,2,2,2}
  //--------------------------------------------------------------------
  fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  Coordinate Block({4, 4, 4, 4});
  Coordinate clatt = GridDefaultLatt();
  for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
  Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
  Coordinate cmpi  = GridDefaultMpi();
  fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
          clatt[0],clatt[1],clatt[2],clatt[3],
          csimd[0],csimd[1],csimd[2],csimd[3],
          cmpi[0],cmpi[1],cmpi[2],cmpi[3],
          (int)vComplex::Nsimd()); fflush(stderr);
  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
  fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
  //--------------------------------------------------------------------
  // RNG + gauge field
  //--------------------------------------------------------------------
  fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
  GridParallelRNG RNG4(UGrid);    RNG4.SeedFixedIntegers({1,2,3,4});
  fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
  GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
  fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
  LatticeGaugeField Umu(UGrid);
  int HotStart = 0;
  if ( HotStart ) {
    fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
    SU<Nc>::HotConfiguration(RNG4, Umu);
  } else { 
    FieldMetaData header;
    std::string file("./configuration.ildg");
    IldgReader IR;
    IR.open(file);
    IR.readConfiguration(Umu,header);
    IR.close();
  }
  fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
  NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
  fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
  SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
  fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
  //--------------------------------------------------------------------
  // Subspace: inverse-iteration near-null vectors
  //
  // CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
  // converging naturally to the low modes of HermOp without needing spectral
  // bound tuning.  Switch to CreateSubspaceChebyshevNew once the spectrum is
  // well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
  //--------------------------------------------------------------------
  typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
  Subspace Aggregates(Coarse4d, UrbGrid, cb);
  Aggregates.CreateSubspace(RNGrb, HermOp);
  Aggregates.Orthogonalise();
  //--------------------------------------------------------------------
  // Coarse geometry: NextToNearestStencilGeometry4D
  //   hops=2  ->  33 stencil points in 4D
  //--------------------------------------------------------------------
  NextToNearestStencilGeometry4D geom(Coarse4d);
  std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
  //--------------------------------------------------------------------
  // Single-RHS coarse operator (used for correctness check below)
  //--------------------------------------------------------------------
  typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
  typedef LittleDiracOp::CoarseVector CoarseVector;
  LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
  LDO.CoarsenOperator(HermOp, Aggregates);
  //--------------------------------------------------------------------
  // Correctness check: P M_fine P^T c  ≈  M_coarse c
  //
  // Promote a random coarse vector into the fine subspace, apply the
  // fine operator, project back, and compare with the coarse operator
  // applied directly.  Error should be at the level of subspace
  // approximation quality (smaller = better basis vectors).
  //--------------------------------------------------------------------
  {
    GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
    CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
    random(RNGc, c_src);
    LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
    Aggregates.PromoteFromSubspace(c_src, f_v);
    HermOp.Op(f_v, f_Mv);
    Aggregates.ProjectToSubspace(c_proj, f_Mv);
    LDO.M(c_src, c_ldop);
    c_proj -= c_ldop;
    RealD err = norm2(c_proj) / norm2(c_ldop);
    std::cout << GridLogMessage
              << "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
  }
  //--------------------------------------------------------------------
  // Multi-RHS coarse grid
  //
  // The extra leading dimension holds nrhs right-hand sides packed into
  // SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
  //--------------------------------------------------------------------
  const int nrhs = vComplex::Nsimd() * 2;
  Coordinate mpi   = GridDefaultMpi();
  Coordinate rhMpi ({1,    mpi[0],  mpi[1],  mpi[2],  mpi[3]});
  Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
  typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
  MultiCoarseOp mrhs(geom, CoarseMrhs);
  mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
  //--------------------------------------------------------------------
  // Coarse-grid Lanczos for deflation
  //--------------------------------------------------------------------
  typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
  MrhsHermOp MrhsCoarseOp(mrhs);
  // Estimate spectral bounds for Lanczos Chebyshev filter
  CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
  PowerMethod<CoarseVector> cPM;
  RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
  // Chebyshev filter window [lo, hi]:
  //   lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
  //   coarse eigenvalues so that only the target modes receive cosh amplification.
  //
  // From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
  //   Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746]  ~= mass^2
  //   Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
  //   Group 2 (second group): lambda in [0.165, 0.179]
  //
  // lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
  // factor 8x below lambda_24=0.165).
  //   hi = lambda_max_coarse * 1.1 ~= 2.121
  //   y(lambda_0=0.002647)  ~ -1.016 -> T_70 ~ 1.7e5  (cosh(70*0.182))
  //   y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
  //   Relative spread across near-null cluster: ~4.3%
  //   y(lambda_24=0.165)    ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
  //
  // order=71 (degree 70) is needed to give ~4% relative spread across the
  // near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
  // gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
  // Absolute amplification ~1e5; what matters for IRL convergence is the
  // relative spread, not the absolute value.
  // lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
  // lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
  RealD lambda_lo  = 0.02;
  std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
            << " hi=" << lambda_max*1.1 << " order=71" << std::endl;
  Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
  // 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
  // in the first Lanczos restart.  The remaining modes (~0.165) are a
  // second spectral group that needs more Krylov vectors; handle them
  // separately once the basic HDCG solve is validated.
  int Nk    = 24;
  int Nm    = 48;
  int Nstop = Nk;
  GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
  ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
    IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
        Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
  int Nconv;
  std::vector<RealD>        eval(Nm);
  std::vector<CoarseVector> evec(Nm,   Coarse4d);  // evec on f_grid (single-RHS coarse)
  std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
  for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
  IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
  //--------------------------------------------------------------------
  // HDCG solver assembly
  //--------------------------------------------------------------------
  MultiRHSDeflation<CoarseVector> MrhsGuesser;
  MrhsGuesser.ImportEigenBasis(evec, eval);
  // MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
  MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
  MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
  MrhsProjector.ImportBasis(Aggregates.subspace);
  ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
  DoNothingGuesser<CoarseVector>  DoNothing;
  HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
  // Spectral radius of the fine operator, needed for the smoother shift.
  // Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
  LatticeStaggeredFermionD fine_pm_src(UrbGrid);
  random(RNGrb, fine_pm_src);
  PowerMethod<LatticeStaggeredFermionD> finePM;
  RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
  // Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 100.
  //
  // The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the
  // interval [shift, lambda_max + shift] ~ [0.046, 4.65], so the polynomial
  // focuses entirely on the HIGH-frequency part of the spectrum and leaves
  // near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
  //
  // This is the right target because the coarse-grid correction always introduces
  // high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
  // are piecewise constant across coarse cells and therefore have sharp edges at
  // cell boundaries (like lego-block edges). Smoothness is measured by the
  // covariant Dirac derivative, so promoting the coarse solution back to the
  // fine grid inevitably excites high-frequency components — just as a step
  // function always carries high-frequency Fourier content.  The smoother must
  // repair exactly these high modes.
  //
  // The smoother and the coarse-grid correction are applied alternately: together
  // they both lift the low eigenvalues and pull down the upper eigenvalues of the
  // composite preconditioned operator, reducing the condition number seen by the
  // outer HDCG iterations.
  //
  // DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
  // the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
  RealD smootherShift = fine_lambda_max / 200.0;
  std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
            << fine_lambda_max << "/200 = " << smootherShift << std::endl;
  ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
  CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
  TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
    HDCG(1.0e-8, 500,
         HermOp,
         smoother,
         HPDSolve,   // M1 (coarse correction)
         HPDSolve,   // Vstart (initial guess projection)
         MrhsProjector,
         MrhsGuesser,
         CoarseMrhs);
  //--------------------------------------------------------------------
  // Solve: nrhs right-hand sides simultaneously
  //--------------------------------------------------------------------
  std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
  std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
  GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
  for (int r = 0; r < nrhs; r++) {
    random(RNGrb2, src[r]);
    sol[r] = Zero();
  }
  //--------------------------------------------------------------------
  // Baseline: standard single-RHS CG on HermOp (no preconditioning)
  // Run before HDCG to establish the unpreconditioned iteration count
  // and wall-clock time for direct comparison.
  //--------------------------------------------------------------------
  {
    ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 100000, false);
    std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
    for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
    RealD t0 = usecond();
    int total_iters = 0;
    for (int r = 0; r < nrhs; r++) {
      std::cout << GridLogMessage << "====== CG baseline RHS " << r
                << " ======" << std::endl;
      CG(HermOp, src[r], cg_sol[r]);
      total_iters += CG.IterationsToComplete;
    }
    RealD t1 = usecond();
    std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
              << total_iters << " total iterations, "
              << (t1 - t0) / 1.0e6 << " s total, "
              << (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
  }
  //--------------------------------------------------------------------
  // HDCG solve
  //--------------------------------------------------------------------
  HDCG(src, sol);
  Grid_finalize();
  return 0;
 }