1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-06-04 11:14:38 +01:00

Some improvements that should have been there if in synch with develop,

and also some staggered hdcg type work
This commit is contained in:
Peter Boyle
2026-05-29 13:36:57 -04:00
parent 34d8d003a8
commit 42cd9eda71
8 changed files with 660 additions and 93 deletions
+1 -1
View File
@@ -59,7 +59,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
#define GRID_CUB_SUM_OP ::cuda::std::plus<>{}
#else
#define GRID_CUB_SUM_OP ::cub::Sum()
#define GRID_CUB_SUM_OP ::gpucub::Sum()
#endif
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
+21 -4
View File
@@ -33,7 +33,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
{
@@ -97,20 +96,38 @@ int main (int argc, char ** argv)
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
NaiveStaggeredFermionD Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=1000;
int ncall=100;
// warm perf only
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t0=usecond();
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t1=usecond();
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
// Warm-up pass for the naive staggered operator; not included in the timing.
for(int i=0;i<ncall;i++){
  Dn.Dhop(src,result,0);
}
// Timed pass. This must call Dn (not Ds): the flop count and the "Called Dn"
// report below both describe the naive operator.
t0=usecond();
for(int i=0;i<ncall;i++){
  Dn.Dhop(src,result,0);
}
t1=usecond();
// Flop count used for Dn: 8 terms, where the Ds count above uses 16.
flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall;
std::cout<<GridLogMessage << "Called Dn"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
Grid_finalize();
}
+170 -6
View File
@@ -716,6 +716,161 @@ public:
return mflops_best;
}
// Benchmark the single-precision naive staggered DhopEO kernel on an L^4 local volume.
//
// L       : local lattice extent per rank in each of the four directions; the
//           global lattice extent in direction mu is L * mpi[mu].
// returns : the best mflop/s figure measured over the kernel cases that were run.
//
// Side effects: sets NN_global, changes StaggeredKernelsStatic::Comms / ::Opt and
// the communicator policy, and writes a report to std::cout.
static double NaiveStaggered(int L)
{
  double mflops;
  double mflops_best = 0;
  double mflops_worst= 0;
  std::vector<double> mflops_all;

  ///////////////////////////////////////////////////////
  // Set/Get the layout & grid size
  ///////////////////////////////////////////////////////
  int threads = GridThread::GetThreads();
  Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
  Coordinate local({L,L,L,L});
  Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});

  // Temporary grid, used only to query the rank and node counts.
  // NOTE(review): TmpGrid, FGrid and FrbGrid are never released in this function;
  // confirm whether the factory expects the caller to delete them.
  GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
                                                           GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                           GridDefaultMpi());
  uint64_t NP = TmpGrid->RankCount();
  uint64_t NN = TmpGrid->NodeCount();
  NN_global=NN;
  uint64_t SHM=NP/NN;

  ///////// Welcome message ////////////
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Benchmark NaiveStaggered on "<<L<<"^4 local volume "<<std::endl;
  std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
  std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
  std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
  std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
  std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
  std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  ///////// Lattice Init ////////////
  GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);

  ///////// RNG Init ////////////
  std::vector<int> seeds4({1,2,3,4});
  GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

  RealD mass=0.1;
  RealD c1=9.0/8.0;
  RealD u0=1.0;

  typedef NaiveStaggeredFermionF Action;
  typedef typename Action::FermionField Fermion;
  typedef LatticeGaugeFieldF Gauge;

  Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);

  typename Action::ImplParams params;
  Action Ds(Umu,*FGrid,*FrbGrid,mass,c1,u0,params);

  ///////// Source preparation ////////////
  Fermion src (FGrid); random(RNG4,src);
  Fermion src_e (FrbGrid);
  Fermion src_o (FrbGrid);
  Fermion r_e (FrbGrid);

  {
    pickCheckerboard(Even,src_e,src);
    pickCheckerboard(Odd,src_o,src);

    // NOTE(review): only the first num_cases entries of Cases are run, so the
    // OptInlineAsm entry is never exercised, and fmt lists four labels while
    // two results are printed. Confirm this is intentional.
    const int num_cases = 2;
    std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");

    controls Cases [] = {
      { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
      { StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
      { StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
    };

    for(int c=0;c<num_cases;c++) {

      StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
      StaggeredKernelsStatic::Opt = Cases[c].Opt;
      CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
      std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

      // Untimed warm-up calls, bracketed by barriers.
      int nwarm = 10;
      FGrid->Barrier();
      for(int i=0;i<nwarm;i++){
        Ds.DhopEO(src_o,r_e,DaggerNo);
      }
      FGrid->Barrier();

      // no timing samples, each covering ni back-to-back DhopEO calls.
      uint64_t no = 50;
      uint64_t ni = 100;

      time_statistics timestat;
      std::vector<double> t_time(no);
      for(uint64_t i=0;i<no;i++){
        double t0=usecond();
        for(uint64_t j=0;j<ni;j++){
          Ds.DhopEO(src_o,r_e,DaggerNo);
        }
        double t1=usecond();
        t_time[i] = t1-t0;
      }
      FGrid->Barrier();

      double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      // Flops for one DhopEO call; the /2 matches the half-volume checkerboard.
      double flops=((8*(3*(6+8+8)) + 7*3*2)*1.0*volume)/2;
      double mf_hi, mf_lo, mf_err;

      // Each sample in t_time covers ni calls, so every rate carries a factor of ni.
      timestat.statistics(t_time);
      mf_hi = flops/timestat.min*ni;
      mf_lo = flops/timestat.max*ni;
      // Scale the error by ni as well, so it is on the same scale as mf_hi / mflops.
      mf_err= flops/timestat.min*ni * timestat.err/timestat.mean;

      mflops = flops/timestat.mean*ni;
      mflops_all.push_back(mflops);
      if ( mflops_best == 0 ) mflops_best = mflops;
      if ( mflops_worst== 0 ) mflops_worst= mflops;
      if ( mflops>mflops_best ) mflops_best = mflops;
      if ( mflops<mflops_worst) mflops_worst= mflops;

      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
      std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
    }

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
    std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
    std::cout<<GridLogMessage <<fmt << std::endl;
    std::cout<<GridLogMessage ;

    // Per-node rate for every case that ran, in the order the cases were executed.
    for(double mf : mflops_all){
      std::cout<<mf/NN<<" ; " ;
    }
    std::cout<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  return mflops_best;
}
static double Clover(int L)
{
double mflops;
@@ -887,6 +1042,7 @@ int main (int argc, char ** argv)
std::vector<double> clover;
std::vector<double> dwf4;
std::vector<double> staggered;
std::vector<double> naive_staggered;
int Ls=1;
if (do_dslash){
@@ -914,13 +1070,21 @@ int main (int argc, char ** argv)
staggered.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Naive Staggered dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::NaiveStaggered(L_list[l]) ;
naive_staggered.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered \t\t Naive Staggered" <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<" \t\t "<<naive_staggered[l]<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
@@ -930,14 +1094,14 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered \t\t NaiveStag \t|\t (GF/s per node)" <<std::endl;
fprintf(FP,"Per node summary table\n");
fprintf(FP,"\n");
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
fprintf(FP,"L , Wilson, DWF4, Staggered, NaiveStag\n");
fprintf(FP,"\n");
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<" \t " <<naive_staggered[l]/NN<<std::endl;
fprintf(FP,"%d , %.0f, %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.,naive_staggered[l]/NN/1000.);
}
fprintf(FP,"\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+78 -63
View File
@@ -1,76 +1,91 @@
Per node summary table
L , Wilson, DWF4, Staggered, NaiveStag
8 , 90, 933, 38, 23
12 , 403, 1688, 178, 113
16 , 188, 1647, 449, 295
24 , 947, 1574, 674, 553
32 , 931, 1371, 718, 643
Memory Bandwidth
Bytes, GB/s per node
6291456, 379.297050
100663296, 3754.674992
509607936, 6521.472413
1610612736, 8513.456479
3932160000, 9018.901766
GEMM
M, N, K, BATCH, GF/s per rank
16, 8, 16, 256, 0.564958
16, 16, 16, 256, 243.148058
16, 32, 16, 256, 440.346877
32, 8, 32, 256, 439.194136
32, 16, 32, 256, 847.334141
32, 32, 32, 256, 1430.892623
64, 8, 64, 256, 1242.756741
64, 16, 64, 256, 2196.689493
64, 32, 64, 256, 3697.458072
16, 8, 256, 256, 899.582627
16, 16, 256, 256, 1673.537756
16, 32, 256, 256, 2959.597089
32, 8, 256, 256, 1558.858630
32, 16, 256, 256, 2864.839445
32, 32, 256, 256, 4810.671254
64, 8, 256, 256, 2386.092942
64, 16, 256, 256, 4451.665937
64, 32, 256, 256, 5942.124095
8, 256, 16, 256, 799.867271
16, 256, 16, 256, 1584.624888
32, 256, 16, 256, 1949.422338
8, 256, 32, 256, 1389.417474
16, 256, 32, 256, 2668.344493
32, 256, 32, 256, 3234.162120
8, 256, 64, 256, 2150.925128
16, 256, 64, 256, 4012.488132
32, 256, 64, 256, 5154.785521
786432, 40.271620
12582912, 433.611792
63700992, 905.374321
201326592, 1114.979152
491520000, 1180.241898
Communications
Packet bytes, direction, GB/s per node
4718592, 1, 245.026198
4718592, 2, 251.180996
4718592, 3, 361.110977
4718592, 5, 247.898447
4718592, 6, 249.867523
4718592, 7, 359.033061
15925248, 1, 255.030946
15925248, 2, 264.453890
15925248, 3, 392.949183
15925248, 5, 256.040644
15925248, 6, 264.681896
15925248, 7, 392.102622
37748736, 1, 258.823333
37748736, 2, 268.181577
37748736, 3, 401.478191
37748736, 5, 258.995363
37748736, 6, 268.206586
37748736, 7, 400.397611
Per node summary table
GEMM
M, N, K, BATCH, GF/s per rank fp64
16, 8, 16, 4096, 693.316363
16, 12, 16, 4096, 657.277058
16, 16, 16, 4096, 711.992616
32, 8, 32, 4096, 821.084324
32, 12, 32, 4096, 1279.852719
32, 16, 32, 4096, 2647.096674
64, 8, 64, 4096, 2630.192325
64, 12, 64, 4096, 3338.071321
64, 16, 64, 4096, 3950.899281
16, 8, 256, 4096, 1638.362501
16, 12, 256, 4096, 2377.502234
16, 16, 256, 4096, 3048.328833
32, 8, 256, 4096, 2917.384276
32, 12, 256, 4096, 4103.085151
32, 16, 256, 4096, 5102.971860
64, 8, 256, 4096, 3222.258206
64, 12, 256, 4096, 4619.456391
64, 16, 256, 4096, 5847.916650
8, 256, 16, 4096, 1728.073337
12, 256, 16, 4096, 2356.653970
16, 256, 16, 4096, 2676.876038
8, 256, 32, 4096, 2611.531990
12, 256, 32, 4096, 3451.573106
16, 256, 32, 4096, 3966.915301
8, 256, 64, 4096, 3436.248737
12, 256, 64, 4096, 4539.497945
16, 256, 64, 4096, 5307.992323
GEMM
M, N, K, BATCH, GF/s per rank fp32
16, 8, 16, 4096, 499.017445
16, 12, 16, 4096, 731.543385
16, 16, 16, 4096, 958.800786
32, 8, 32, 4096, 1549.813550
32, 12, 32, 4096, 2147.907502
32, 16, 32, 4096, 2601.698596
64, 8, 64, 4096, 3785.446233
64, 12, 64, 4096, 5116.694843
64, 16, 64, 4096, 6109.345016
16, 8, 256, 4096, 1206.627737
16, 12, 256, 4096, 1809.699599
16, 16, 256, 4096, 2412.014053
32, 8, 256, 4096, 2406.114488
32, 12, 256, 4096, 3605.531907
32, 16, 256, 4096, 4798.444037
64, 8, 256, 4096, 4688.711196
64, 12, 256, 4096, 6990.696301
64, 16, 256, 4096, 9214.749925
8, 256, 16, 4096, 2596.307289
12, 256, 16, 4096, 3439.892562
16, 256, 16, 4096, 3907.201036
8, 256, 32, 4096, 3012.752067
12, 256, 32, 4096, 3904.217583
16, 256, 32, 4096, 4599.047092
8, 256, 64, 4096, 3721.999042
12, 256, 64, 4096, 5098.573927
16, 256, 64, 4096, 6159.080872
L , Wilson, DWF4, Staggered, GF/s per node
8 , 155, 1386, 50
12 , 694, 4208, 230
16 , 1841, 6675, 609
24 , 3934, 8573, 1641
32 , 5083, 9771, 3086
1 Memory Bandwidth Per node summary table
1 Per node summary table
2 L , Wilson, DWF4, Staggered, NaiveStag
3 8 , 90, 933, 38, 23
4 12 , 403, 1688, 178, 113
5 16 , 188, 1647, 449, 295
6 24 , 947, 1574, 674, 553
7 32 , 931, 1371, 718, 643
8 Memory Bandwidth
9 Bytes, GB/s per node
10 786432, 40.271620
11 Memory Bandwidth 12582912, 433.611792
12 Bytes, GB/s per node 63700992, 905.374321
13 6291456, 379.297050 201326592, 1114.979152
14 100663296, 3754.674992 491520000, 1180.241898
15 509607936, 6521.472413 Communications
16 1610612736, 8513.456479 Packet bytes, direction, GB/s per node
17 3932160000, 9018.901766 GEMM
18 GEMM M, N, K, BATCH, GF/s per rank fp64
M, N, K, BATCH, GF/s per rank
16, 8, 16, 256, 0.564958
16, 16, 16, 256, 243.148058
16, 32, 16, 256, 440.346877
32, 8, 32, 256, 439.194136
32, 16, 32, 256, 847.334141
32, 32, 32, 256, 1430.892623
64, 8, 64, 256, 1242.756741
64, 16, 64, 256, 2196.689493
64, 32, 64, 256, 3697.458072
16, 8, 256, 256, 899.582627
16, 16, 256, 256, 1673.537756
16, 32, 256, 256, 2959.597089
32, 8, 256, 256, 1558.858630
32, 16, 256, 256, 2864.839445
32, 32, 256, 256, 4810.671254
64, 8, 256, 256, 2386.092942
64, 16, 256, 256, 4451.665937
64, 32, 256, 256, 5942.124095
8, 256, 16, 256, 799.867271
16, 256, 16, 256, 1584.624888
32, 256, 16, 256, 1949.422338
8, 256, 32, 256, 1389.417474
16, 256, 32, 256, 2668.344493
32, 256, 32, 256, 3234.162120
8, 256, 64, 256, 2150.925128
16, 256, 64, 256, 4012.488132
32, 256, 64, 256, 5154.785521
Communications
Packet bytes, direction, GB/s per node
4718592, 1, 245.026198
4718592, 2, 251.180996
4718592, 3, 361.110977
19 4718592, 5, 247.898447 16, 8, 16, 4096, 693.316363
20 4718592, 6, 249.867523 16, 12, 16, 4096, 657.277058
21 4718592, 7, 359.033061 16, 16, 16, 4096, 711.992616
22 15925248, 1, 255.030946 32, 8, 32, 4096, 821.084324
23 15925248, 2, 264.453890 32, 12, 32, 4096, 1279.852719
15925248, 3, 392.949183
15925248, 5, 256.040644
15925248, 6, 264.681896
15925248, 7, 392.102622
37748736, 1, 258.823333
37748736, 2, 268.181577
37748736, 3, 401.478191
37748736, 5, 258.995363
37748736, 6, 268.206586
37748736, 7, 400.397611
Per node summary table
L , Wilson, DWF4, Staggered, GF/s per node
8 , 155, 1386, 50
12 , 694, 4208, 230
16 , 1841, 6675, 609
24 , 3934, 8573, 1641
32 , 5083, 9771, 3086
24 32, 16, 32, 4096, 2647.096674
25 64, 8, 64, 4096, 2630.192325
26 64, 12, 64, 4096, 3338.071321
27 64, 16, 64, 4096, 3950.899281
28 16, 8, 256, 4096, 1638.362501
29 16, 12, 256, 4096, 2377.502234
30 16, 16, 256, 4096, 3048.328833
31 32, 8, 256, 4096, 2917.384276
32 32, 12, 256, 4096, 4103.085151
33 32, 16, 256, 4096, 5102.971860
34 64, 8, 256, 4096, 3222.258206
35 64, 12, 256, 4096, 4619.456391
36 64, 16, 256, 4096, 5847.916650
37 8, 256, 16, 4096, 1728.073337
38 12, 256, 16, 4096, 2356.653970
39 16, 256, 16, 4096, 2676.876038
40 8, 256, 32, 4096, 2611.531990
41 12, 256, 32, 4096, 3451.573106
42 16, 256, 32, 4096, 3966.915301
43 8, 256, 64, 4096, 3436.248737
44 12, 256, 64, 4096, 4539.497945
45 16, 256, 64, 4096, 5307.992323
46 GEMM
47 M, N, K, BATCH, GF/s per rank fp32
48 16, 8, 16, 4096, 499.017445
49 16, 12, 16, 4096, 731.543385
50 16, 16, 16, 4096, 958.800786
51 32, 8, 32, 4096, 1549.813550
52 32, 12, 32, 4096, 2147.907502
53 32, 16, 32, 4096, 2601.698596
54 64, 8, 64, 4096, 3785.446233
55 64, 12, 64, 4096, 5116.694843
56 64, 16, 64, 4096, 6109.345016
57 16, 8, 256, 4096, 1206.627737
58 16, 12, 256, 4096, 1809.699599
59 16, 16, 256, 4096, 2412.014053
60 32, 8, 256, 4096, 2406.114488
61 32, 12, 256, 4096, 3605.531907
62 32, 16, 256, 4096, 4798.444037
63 64, 8, 256, 4096, 4688.711196
64 64, 12, 256, 4096, 6990.696301
65 64, 16, 256, 4096, 9214.749925
66 8, 256, 16, 4096, 2596.307289
67 12, 256, 16, 4096, 3439.892562
68 16, 256, 16, 4096, 3907.201036
69 8, 256, 32, 4096, 3012.752067
70 12, 256, 32, 4096, 3904.217583
71 16, 256, 32, 4096, 4599.047092
72 8, 256, 64, 4096, 3721.999042
73 12, 256, 64, 4096, 5098.573927
74 16, 256, 64, 4096, 6159.080872
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
+5 -5
View File
@@ -1,4 +1,3 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
@@ -9,12 +8,13 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--with-openssl=$OPENSSL \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft"
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include " \
LDFLAGS="-L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft -lamdhip64"
+10 -12
View File
@@ -1,16 +1,14 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
. /autofs/nccs-svm1_home1/paboyle/spack/share/spack/setup-env.sh
module load amd/7.0.2
module load cray-fftw
module load craype-accel-amd-gfx90a
mkdir $HOME/LD_PATH
ln -s /opt/rocm-6.4.2/lib/libamdhip* $HOME/LD_PATH
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' `
export OPENSSL=`spack find --paths openssl | grep openssl | awk '{print $2}' `
export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' `
#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/LD_PATH/
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-7.0.2/lib
module load cce/21.0.0
module load cpe/26.03
module load rocm/7.0.2
export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocm-7.0.2/lib/llvm/lib/:$LD_LIBRARY_PATH
+2 -2
View File
@@ -36,8 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
//gridblasHandle_t GridBLAS::gridblasHandle;
//int GridBLAS::gridblasInit;
///////////////////////
// Tells little dirac op to use MdagM as the .Op()
+373
View File
@@ -0,0 +1,373 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: tests/debug/Test_staggered_hdcg.cc
Authors: Thomas Blum, Peter Boyle
HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
for naive staggered fermions, based on arXiv:2409.03904.
Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
- NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
- 4D SchurStaggeredOperator: Mpc = m^2 - D_oe * D_eo (hermitian, positive-definite)
- vColourVector fine field type (staggered has colour but no spin)
- NextToNearestStencilGeometry4D: 33-point coarse stencil
Stencil count: D_oe*D_eo has 2-hop fine range. With blocking B >= 2 the coarse
shifts have L1-distance <= 2, giving 33 stencil points in 4D:
1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
To extend to ImprovedStaggeredFermion later, use B >= 6.
Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
Usage (after build):
./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
*************************************************************************************/
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
using namespace Grid;
// Smoother that applies conjugate gradient with a fixed iteration budget.
// Each application starts from a zero guess and runs CG with tolerance 0.0
// and at most `iters` iterations.
template<class Field>
class CGSmoother : public LinearFunction<Field>
{
public:
  using FineOperator = LinearOperatorBase<Field>;

  FineOperator &_Op;   // operator handed to CG on every application
  int iters;           // iteration limit passed to ConjugateGradient

  CGSmoother(int _iters, FineOperator &Op)
    : _Op(Op), iters(_iters)
  {}

  // Overwrites `out` with the result of CG applied to `in`, starting from zero.
  void operator()(const Field &in, Field &out)
  {
    out = Zero();
    // The final `false` presumably stops CG aborting when it does not
    // converge (TODO confirm against the ConjugateGradient constructor).
    ConjugateGradient<Field> smoother(0.0, iters, false);
    smoother(_Op, in, out);
  }
};
int main(int argc, char **argv)
{
fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
Grid_init(&argc, &argv);
fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
//--------------------------------------------------------------------
// Parameters — tune for production
//--------------------------------------------------------------------
const int nbasis = 24; // near-null space dimension
const int cb = 0; // even checkerboard
RealD mass = 0.00184;
// NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
// c1 = coefficient of the hopping term (1.0 = standard normalisation).
// u0 = tadpole factor (1.0 = no tadpole improvement).
RealD c1 = 1.0;
RealD u0 = 1.0;
//--------------------------------------------------------------------
// Grids
// Fine: UGrid (4D full), UrbGrid (4D red-black)
// Coarse: Coarse4d with dimensions = GridDefaultLatt() / Block
//
// Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
// NaiveStaggeredFermion works with any Block >= {2,2,2,2}
//--------------------------------------------------------------------
fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
Coordinate Block({4, 4, 4, 4});
Coordinate clatt = GridDefaultLatt();
for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
Coordinate cmpi = GridDefaultMpi();
fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
clatt[0],clatt[1],clatt[2],clatt[3],
csimd[0],csimd[1],csimd[2],csimd[3],
cmpi[0],cmpi[1],cmpi[2],cmpi[3],
(int)vComplex::Nsimd()); fflush(stderr);
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
//--------------------------------------------------------------------
// RNG + gauge field
//--------------------------------------------------------------------
fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1,2,3,4});
fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
LatticeGaugeField Umu(UGrid);
int HotStart = 0;
if ( HotStart ) {
fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
SU<Nc>::HotConfiguration(RNG4, Umu);
} else {
FieldMetaData header;
std::string file("./configuration.ildg");
IldgReader IR;
IR.open(file);
IR.readConfiguration(Umu,header);
IR.close();
}
fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
//--------------------------------------------------------------------
// Subspace: inverse-iteration near-null vectors
//
// CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
// converging naturally to the low modes of HermOp without needing spectral
// bound tuning. Switch to CreateSubspaceChebyshevNew once the spectrum is
// well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
//--------------------------------------------------------------------
typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
Subspace Aggregates(Coarse4d, UrbGrid, cb);
Aggregates.CreateSubspace(RNGrb, HermOp);
Aggregates.Orthogonalise();
//--------------------------------------------------------------------
// Coarse geometry: NextToNearestStencilGeometry4D
// hops=2 -> 33 stencil points in 4D
//--------------------------------------------------------------------
NextToNearestStencilGeometry4D geom(Coarse4d);
std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
//--------------------------------------------------------------------
// Single-RHS coarse operator (used for correctness check below)
//--------------------------------------------------------------------
typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
typedef LittleDiracOp::CoarseVector CoarseVector;
LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
LDO.CoarsenOperator(HermOp, Aggregates);
//--------------------------------------------------------------------
// Correctness check: P M_fine P^T c ≈ M_coarse c
//
// Promote a random coarse vector into the fine subspace, apply the
// fine operator, project back, and compare with the coarse operator
// applied directly. Error should be at the level of subspace
// approximation quality (smaller = better basis vectors).
//--------------------------------------------------------------------
{
GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
random(RNGc, c_src);
LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
Aggregates.PromoteFromSubspace(c_src, f_v);
HermOp.Op(f_v, f_Mv);
Aggregates.ProjectToSubspace(c_proj, f_Mv);
LDO.M(c_src, c_ldop);
c_proj -= c_ldop;
RealD err = norm2(c_proj) / norm2(c_ldop);
std::cout << GridLogMessage
<< "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
}
//--------------------------------------------------------------------
// Multi-RHS coarse grid
//
// The extra leading dimension holds nrhs right-hand sides packed into
// SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
//--------------------------------------------------------------------
const int nrhs = vComplex::Nsimd() * 2;
Coordinate mpi = GridDefaultMpi();
Coordinate rhMpi ({1, mpi[0], mpi[1], mpi[2], mpi[3]});
Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
MultiCoarseOp mrhs(geom, CoarseMrhs);
mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
//--------------------------------------------------------------------
// Coarse-grid Lanczos for deflation
//--------------------------------------------------------------------
typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
MrhsHermOp MrhsCoarseOp(mrhs);
// Estimate spectral bounds for Lanczos Chebyshev filter
CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
PowerMethod<CoarseVector> cPM;
RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
// Chebyshev filter window [lo, hi]:
// lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
// coarse eigenvalues so that only the target modes receive cosh amplification.
//
// From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
// Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746] ~= mass^2
// Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
// Group 2 (second group): lambda in [0.165, 0.179]
//
// lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
// factor 8x below lambda_24=0.165).
// hi = lambda_max_coarse * 1.1 ~= 2.121
// y(lambda_0=0.002647) ~ -1.016 -> T_70 ~ 1.7e5 (cosh(70*0.182))
// y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
// Relative spread across near-null cluster: ~4.3%
// y(lambda_24=0.165) ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
//
// order=71 (degree 70) is needed to give ~4% relative spread across the
// near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
// gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
// Absolute amplification ~1e5; what matters for IRL convergence is the
// relative spread, not the absolute value.
// lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
// lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
RealD lambda_lo = 0.02;
std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
<< " hi=" << lambda_max*1.1 << " order=71" << std::endl;
Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
// 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
// in the first Lanczos restart. The remaining modes (~0.165) are a
// second spectral group that needs more Krylov vectors; handle them
// separately once the basic HDCG solve is validated.
int Nk = 24;
int Nm = 48;
int Nstop = Nk;
GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
int Nconv;
std::vector<RealD> eval(Nm);
std::vector<CoarseVector> evec(Nm, Coarse4d); // evec on f_grid (single-RHS coarse)
std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
//--------------------------------------------------------------------
// HDCG solver assembly
//--------------------------------------------------------------------
MultiRHSDeflation<CoarseVector> MrhsGuesser;
MrhsGuesser.ImportEigenBasis(evec, eval);
// MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
MrhsProjector.ImportBasis(Aggregates.subspace);
ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
DoNothingGuesser<CoarseVector> DoNothing;
HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
// Spectral radius of the fine operator, needed for the smoother shift.
// Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
LatticeStaggeredFermionD fine_pm_src(UrbGrid);
random(RNGrb, fine_pm_src);
PowerMethod<LatticeStaggeredFermionD> finePM;
RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
// Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 200.
//
// The order-8 CG polynomial has 8 roots. With this shift all 8 roots lie in the
// interval [shift, lambda_max + shift] ~ [0.023, 4.62], so the polynomial
// focuses entirely on the HIGH-frequency part of the spectrum and leaves
// near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
//
// This is the right target because the coarse-grid correction always introduces
// high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
// are piecewise constant across coarse cells and therefore have sharp edges at
// cell boundaries (like lego-block edges). Smoothness is measured by the
// covariant Dirac derivative, so promoting the coarse solution back to the
// fine grid inevitably excites high-frequency components — just as a step
// function always carries high-frequency Fourier content. The smoother must
// repair exactly these high modes.
//
// The smoother and the coarse-grid correction are applied alternately: together
// they both lift the low eigenvalues and pull down the upper eigenvalues of the
// composite preconditioned operator, reducing the condition number seen by the
// outer HDCG iterations.
//
// DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
// the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
RealD smootherShift = fine_lambda_max / 200.0;
std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
<< fine_lambda_max << "/200 = " << smootherShift << std::endl;
ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
HDCG(1.0e-8, 500,
HermOp,
smoother,
HPDSolve, // M1 (coarse correction)
HPDSolve, // Vstart (initial guess projection)
MrhsProjector,
MrhsGuesser,
CoarseMrhs);
//--------------------------------------------------------------------
// Solve: nrhs right-hand sides simultaneously
//--------------------------------------------------------------------
std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
for (int r = 0; r < nrhs; r++) {
random(RNGrb2, src[r]);
sol[r] = Zero();
}
//--------------------------------------------------------------------
// Baseline: standard single-RHS CG on HermOp (no preconditioning)
// Run before HDCG to establish the unpreconditioned iteration count
// and wall-clock time for direct comparison.
//--------------------------------------------------------------------
{
// Scoped so that the baseline solver and its solution vectors are local to
// this comparison; the HDCG solution vectors (sol) are not touched here.
// First argument 1.0e-8 matches the value passed to the HDCG solver above.
// NOTE(review): the arguments are presumably (tolerance, max iterations,
// abort-on-non-convergence flag) -- confirm against ConjugateGradient.
ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 100000, false);
std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
// Start every baseline solve from a zero initial guess.
for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
RealD t0 = usecond();
int total_iters = 0;
// Solve the right-hand sides one at a time, reusing the same sources (src)
// that are later handed to HDCG, and accumulate the iteration counts.
for (int r = 0; r < nrhs; r++) {
std::cout << GridLogMessage << "====== CG baseline RHS " << r
<< " ======" << std::endl;
CG(HermOp, src[r], cg_sol[r]);
total_iters += CG.IterationsToComplete;
}
RealD t1 = usecond();
// The elapsed time is divided by 1.0e6 and printed with an "s" label
// (usecond() presumably returns microseconds -- confirm), both in total and
// averaged per right-hand side.
std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
<< total_iters << " total iterations, "
<< (t1 - t0) / 1.0e6 << " s total, "
<< (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
}
//--------------------------------------------------------------------
// HDCG solve
//--------------------------------------------------------------------
HDCG(src, sol);
Grid_finalize();
return 0;
}