mirror of
https://github.com/paboyle/Grid.git
synced 2026-06-04 11:14:38 +01:00
Some improvements that should have been there if in synch with develop,
and also some staggered hdcg type work
This commit is contained in:
@@ -59,7 +59,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
|||||||
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
|
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
|
||||||
#define GRID_CUB_SUM_OP ::cuda::std::plus<>{}
|
#define GRID_CUB_SUM_OP ::cuda::std::plus<>{}
|
||||||
#else
|
#else
|
||||||
#define GRID_CUB_SUM_OP ::cub::Sum()
|
#define GRID_CUB_SUM_OP ::gpucub::Sum()
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
|
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
;
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
int main (int argc, char ** argv)
|
||||||
{
|
{
|
||||||
@@ -97,20 +96,38 @@ int main (int argc, char ** argv)
|
|||||||
RealD c2=-1.0/24.0;
|
RealD c2=-1.0/24.0;
|
||||||
RealD u0=1.0;
|
RealD u0=1.0;
|
||||||
ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
|
ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
|
||||||
|
NaiveStaggeredFermionD Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
|
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
|
||||||
int ncall=1000;
|
int ncall=100;
|
||||||
|
// warm perf only
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
Ds.Dhop(src,result,0);
|
||||||
|
}
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
Ds.Dhop(src,result,0);
|
Ds.Dhop(src,result,0);
|
||||||
}
|
}
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
// Warm perf only
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
Dn.Dhop(src,result,0);
|
||||||
|
}
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
Ds.Dhop(src,result,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall;
|
||||||
|
std::cout<<GridLogMessage << "Called Dn"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -716,6 +716,161 @@ public:
|
|||||||
return mflops_best;
|
return mflops_best;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static double NaiveStaggered(int L)
|
||||||
|
{
|
||||||
|
double mflops;
|
||||||
|
double mflops_best = 0;
|
||||||
|
double mflops_worst= 0;
|
||||||
|
std::vector<double> mflops_all;
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
// Set/Get the layout & grid size
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
|
||||||
|
Coordinate local({L,L,L,L});
|
||||||
|
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
|
||||||
|
|
||||||
|
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
|
uint64_t NP = TmpGrid->RankCount();
|
||||||
|
uint64_t NN = TmpGrid->NodeCount();
|
||||||
|
NN_global=NN;
|
||||||
|
uint64_t SHM=NP/NN;
|
||||||
|
|
||||||
|
|
||||||
|
///////// Welcome message ////////////
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Benchmark NaiveStaggered on "<<L<<"^4 local volume "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
///////// Lattice Init ////////////
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
|
||||||
|
|
||||||
|
///////// RNG Init ////////////
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
|
||||||
|
typedef NaiveStaggeredFermionF Action;
|
||||||
|
typedef typename Action::FermionField Fermion;
|
||||||
|
typedef LatticeGaugeFieldF Gauge;
|
||||||
|
|
||||||
|
Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||||
|
|
||||||
|
typename Action::ImplParams params;
|
||||||
|
Action Ds(Umu,*FGrid,*FrbGrid,mass,c1,u0,params);
|
||||||
|
|
||||||
|
///////// Source preparation ////////////
|
||||||
|
Fermion src (FGrid); random(RNG4,src);
|
||||||
|
Fermion src_e (FrbGrid);
|
||||||
|
Fermion src_o (FrbGrid);
|
||||||
|
Fermion r_e (FrbGrid);
|
||||||
|
Fermion r_o (FrbGrid);
|
||||||
|
Fermion r_eo (FGrid);
|
||||||
|
|
||||||
|
{
|
||||||
|
|
||||||
|
pickCheckerboard(Even,src_e,src);
|
||||||
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
|
const int num_cases = 2;
|
||||||
|
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
|
||||||
|
|
||||||
|
controls Cases [] = {
|
||||||
|
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
|
||||||
|
{ StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
|
||||||
|
{ StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
|
||||||
|
};
|
||||||
|
|
||||||
|
for(int c=0;c<num_cases;c++) {
|
||||||
|
|
||||||
|
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
|
||||||
|
StaggeredKernelsStatic::Opt = Cases[c].Opt;
|
||||||
|
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
int nwarm = 10;
|
||||||
|
double t0=usecond();
|
||||||
|
FGrid->Barrier();
|
||||||
|
for(int i=0;i<nwarm;i++){
|
||||||
|
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
|
FGrid->Barrier();
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
uint64_t no = 50;
|
||||||
|
uint64_t ni = 100;
|
||||||
|
|
||||||
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
|
||||||
|
time_statistics timestat;
|
||||||
|
std::vector<double> t_time(no);
|
||||||
|
for(uint64_t i=0;i<no;i++){
|
||||||
|
t0=usecond();
|
||||||
|
for(uint64_t j=0;j<ni;j++){
|
||||||
|
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
t_time[i] = t1-t0;
|
||||||
|
}
|
||||||
|
FGrid->Barrier();
|
||||||
|
|
||||||
|
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=((8*(3*(6+8+8)) + 7*3*2)*1.0*volume)/2;
|
||||||
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
|
timestat.statistics(t_time);
|
||||||
|
mf_hi = flops/timestat.min*ni;
|
||||||
|
mf_lo = flops/timestat.max*ni;
|
||||||
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
|
mflops = flops/timestat.mean*ni;
|
||||||
|
mflops_all.push_back(mflops);
|
||||||
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
|
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||||
|
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage <<fmt << std::endl;
|
||||||
|
std::cout<<GridLogMessage ;
|
||||||
|
|
||||||
|
for(int i=0;i<mflops_all.size();i++){
|
||||||
|
std::cout<<mflops_all[i]/NN<<" ; " ;
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
return mflops_best;
|
||||||
|
}
|
||||||
|
|
||||||
static double Clover(int L)
|
static double Clover(int L)
|
||||||
{
|
{
|
||||||
double mflops;
|
double mflops;
|
||||||
@@ -887,6 +1042,7 @@ int main (int argc, char ** argv)
|
|||||||
std::vector<double> clover;
|
std::vector<double> clover;
|
||||||
std::vector<double> dwf4;
|
std::vector<double> dwf4;
|
||||||
std::vector<double> staggered;
|
std::vector<double> staggered;
|
||||||
|
std::vector<double> naive_staggered;
|
||||||
|
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
if (do_dslash){
|
if (do_dslash){
|
||||||
@@ -914,13 +1070,21 @@ int main (int argc, char ** argv)
|
|||||||
staggered.push_back(result);
|
staggered.push_back(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Naive Staggered dslash 4D vectorised" <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
double result = Benchmark::NaiveStaggered(L_list[l]) ;
|
||||||
|
naive_staggered.push_back(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
|
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered \t\t Naive Staggered" <<std::endl;
|
||||||
for(int l=0;l<L_list.size();l++){
|
for(int l=0;l<L_list.size();l++){
|
||||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<" \t\t "<<naive_staggered[l]<<std::endl;
|
||||||
}
|
}
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
}
|
}
|
||||||
@@ -930,14 +1094,14 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
|
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered \t\t NaiveStag \t|\t (GF/s per node)" <<std::endl;
|
||||||
fprintf(FP,"Per node summary table\n");
|
fprintf(FP,"Per node summary table\n");
|
||||||
fprintf(FP,"\n");
|
fprintf(FP,"\n");
|
||||||
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
|
fprintf(FP,"L , Wilson, DWF4, Staggered, NaiveStag\n");
|
||||||
fprintf(FP,"\n");
|
fprintf(FP,"\n");
|
||||||
for(int l=0;l<L_list.size();l++){
|
for(int l=0;l<L_list.size();l++){
|
||||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<" \t " <<naive_staggered[l]/NN<<std::endl;
|
||||||
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
|
fprintf(FP,"%d , %.0f, %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.,naive_staggered[l]/NN/1000.);
|
||||||
}
|
}
|
||||||
fprintf(FP,"\n");
|
fprintf(FP,"\n");
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|||||||
@@ -1,76 +1,91 @@
|
|||||||
|
Per node summary table
|
||||||
|
|
||||||
|
L , Wilson, DWF4, Staggered, NaiveStag
|
||||||
|
|
||||||
|
8 , 90, 933, 38, 23
|
||||||
|
12 , 403, 1688, 178, 113
|
||||||
|
16 , 188, 1647, 449, 295
|
||||||
|
24 , 947, 1574, 674, 553
|
||||||
|
32 , 931, 1371, 718, 643
|
||||||
|
|
||||||
Memory Bandwidth
|
Memory Bandwidth
|
||||||
|
|
||||||
Bytes, GB/s per node
|
Bytes, GB/s per node
|
||||||
6291456, 379.297050
|
786432, 40.271620
|
||||||
100663296, 3754.674992
|
12582912, 433.611792
|
||||||
509607936, 6521.472413
|
63700992, 905.374321
|
||||||
1610612736, 8513.456479
|
201326592, 1114.979152
|
||||||
3932160000, 9018.901766
|
491520000, 1180.241898
|
||||||
|
|
||||||
|
|
||||||
GEMM
|
|
||||||
|
|
||||||
M, N, K, BATCH, GF/s per rank
|
|
||||||
16, 8, 16, 256, 0.564958
|
|
||||||
16, 16, 16, 256, 243.148058
|
|
||||||
16, 32, 16, 256, 440.346877
|
|
||||||
32, 8, 32, 256, 439.194136
|
|
||||||
32, 16, 32, 256, 847.334141
|
|
||||||
32, 32, 32, 256, 1430.892623
|
|
||||||
64, 8, 64, 256, 1242.756741
|
|
||||||
64, 16, 64, 256, 2196.689493
|
|
||||||
64, 32, 64, 256, 3697.458072
|
|
||||||
16, 8, 256, 256, 899.582627
|
|
||||||
16, 16, 256, 256, 1673.537756
|
|
||||||
16, 32, 256, 256, 2959.597089
|
|
||||||
32, 8, 256, 256, 1558.858630
|
|
||||||
32, 16, 256, 256, 2864.839445
|
|
||||||
32, 32, 256, 256, 4810.671254
|
|
||||||
64, 8, 256, 256, 2386.092942
|
|
||||||
64, 16, 256, 256, 4451.665937
|
|
||||||
64, 32, 256, 256, 5942.124095
|
|
||||||
8, 256, 16, 256, 799.867271
|
|
||||||
16, 256, 16, 256, 1584.624888
|
|
||||||
32, 256, 16, 256, 1949.422338
|
|
||||||
8, 256, 32, 256, 1389.417474
|
|
||||||
16, 256, 32, 256, 2668.344493
|
|
||||||
32, 256, 32, 256, 3234.162120
|
|
||||||
8, 256, 64, 256, 2150.925128
|
|
||||||
16, 256, 64, 256, 4012.488132
|
|
||||||
32, 256, 64, 256, 5154.785521
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Communications
|
Communications
|
||||||
|
|
||||||
Packet bytes, direction, GB/s per node
|
Packet bytes, direction, GB/s per node
|
||||||
4718592, 1, 245.026198
|
|
||||||
4718592, 2, 251.180996
|
|
||||||
4718592, 3, 361.110977
|
|
||||||
4718592, 5, 247.898447
|
|
||||||
4718592, 6, 249.867523
|
|
||||||
4718592, 7, 359.033061
|
|
||||||
15925248, 1, 255.030946
|
|
||||||
15925248, 2, 264.453890
|
|
||||||
15925248, 3, 392.949183
|
|
||||||
15925248, 5, 256.040644
|
|
||||||
15925248, 6, 264.681896
|
|
||||||
15925248, 7, 392.102622
|
|
||||||
37748736, 1, 258.823333
|
|
||||||
37748736, 2, 268.181577
|
|
||||||
37748736, 3, 401.478191
|
|
||||||
37748736, 5, 258.995363
|
|
||||||
37748736, 6, 268.206586
|
|
||||||
37748736, 7, 400.397611
|
|
||||||
|
|
||||||
|
|
||||||
Per node summary table
|
GEMM
|
||||||
|
|
||||||
|
M, N, K, BATCH, GF/s per rank fp64
|
||||||
|
16, 8, 16, 4096, 693.316363
|
||||||
|
16, 12, 16, 4096, 657.277058
|
||||||
|
16, 16, 16, 4096, 711.992616
|
||||||
|
32, 8, 32, 4096, 821.084324
|
||||||
|
32, 12, 32, 4096, 1279.852719
|
||||||
|
32, 16, 32, 4096, 2647.096674
|
||||||
|
64, 8, 64, 4096, 2630.192325
|
||||||
|
64, 12, 64, 4096, 3338.071321
|
||||||
|
64, 16, 64, 4096, 3950.899281
|
||||||
|
16, 8, 256, 4096, 1638.362501
|
||||||
|
16, 12, 256, 4096, 2377.502234
|
||||||
|
16, 16, 256, 4096, 3048.328833
|
||||||
|
32, 8, 256, 4096, 2917.384276
|
||||||
|
32, 12, 256, 4096, 4103.085151
|
||||||
|
32, 16, 256, 4096, 5102.971860
|
||||||
|
64, 8, 256, 4096, 3222.258206
|
||||||
|
64, 12, 256, 4096, 4619.456391
|
||||||
|
64, 16, 256, 4096, 5847.916650
|
||||||
|
8, 256, 16, 4096, 1728.073337
|
||||||
|
12, 256, 16, 4096, 2356.653970
|
||||||
|
16, 256, 16, 4096, 2676.876038
|
||||||
|
8, 256, 32, 4096, 2611.531990
|
||||||
|
12, 256, 32, 4096, 3451.573106
|
||||||
|
16, 256, 32, 4096, 3966.915301
|
||||||
|
8, 256, 64, 4096, 3436.248737
|
||||||
|
12, 256, 64, 4096, 4539.497945
|
||||||
|
16, 256, 64, 4096, 5307.992323
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
GEMM
|
||||||
|
|
||||||
|
M, N, K, BATCH, GF/s per rank fp32
|
||||||
|
16, 8, 16, 4096, 499.017445
|
||||||
|
16, 12, 16, 4096, 731.543385
|
||||||
|
16, 16, 16, 4096, 958.800786
|
||||||
|
32, 8, 32, 4096, 1549.813550
|
||||||
|
32, 12, 32, 4096, 2147.907502
|
||||||
|
32, 16, 32, 4096, 2601.698596
|
||||||
|
64, 8, 64, 4096, 3785.446233
|
||||||
|
64, 12, 64, 4096, 5116.694843
|
||||||
|
64, 16, 64, 4096, 6109.345016
|
||||||
|
16, 8, 256, 4096, 1206.627737
|
||||||
|
16, 12, 256, 4096, 1809.699599
|
||||||
|
16, 16, 256, 4096, 2412.014053
|
||||||
|
32, 8, 256, 4096, 2406.114488
|
||||||
|
32, 12, 256, 4096, 3605.531907
|
||||||
|
32, 16, 256, 4096, 4798.444037
|
||||||
|
64, 8, 256, 4096, 4688.711196
|
||||||
|
64, 12, 256, 4096, 6990.696301
|
||||||
|
64, 16, 256, 4096, 9214.749925
|
||||||
|
8, 256, 16, 4096, 2596.307289
|
||||||
|
12, 256, 16, 4096, 3439.892562
|
||||||
|
16, 256, 16, 4096, 3907.201036
|
||||||
|
8, 256, 32, 4096, 3012.752067
|
||||||
|
12, 256, 32, 4096, 3904.217583
|
||||||
|
16, 256, 32, 4096, 4599.047092
|
||||||
|
8, 256, 64, 4096, 3721.999042
|
||||||
|
12, 256, 64, 4096, 5098.573927
|
||||||
|
16, 256, 64, 4096, 6159.080872
|
||||||
|
|
||||||
L , Wilson, DWF4, Staggered, GF/s per node
|
|
||||||
|
|
||||||
8 , 155, 1386, 50
|
|
||||||
12 , 694, 4208, 230
|
|
||||||
16 , 1841, 6675, 609
|
|
||||||
24 , 3934, 8573, 1641
|
|
||||||
32 , 5083, 9771, 3086
|
|
||||||
|
|
||||||
|
|||||||
|
@@ -1,4 +1,3 @@
|
|||||||
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
|
||||||
../../configure --enable-comms=mpi-auto \
|
../../configure --enable-comms=mpi-auto \
|
||||||
--with-lime=$CLIME \
|
--with-lime=$CLIME \
|
||||||
--enable-unified=no \
|
--enable-unified=no \
|
||||||
@@ -9,12 +8,13 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
|||||||
--disable-gparity \
|
--disable-gparity \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
--with-gmp=$OLCF_GMP_ROOT \
|
--with-gmp=$GMP \
|
||||||
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
|
--with-mpfr=$MPFR \
|
||||||
|
--with-openssl=$OPENSSL \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
CXX=hipcc MPICXX=mpicxx \
|
CXX=hipcc MPICXX=mpicxx \
|
||||||
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
|
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include " \
|
||||||
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft"
|
LDFLAGS="-L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft -lamdhip64"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,16 +1,14 @@
|
|||||||
|
|
||||||
echo spack
|
echo spack
|
||||||
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
|
. /autofs/nccs-svm1_home1/paboyle/spack/share/spack/setup-env.sh
|
||||||
|
|
||||||
module load amd/7.0.2
|
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||||
module load cray-fftw
|
export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' `
|
||||||
module load craype-accel-amd-gfx90a
|
export OPENSSL=`spack find --paths openssl | grep openssl | awk '{print $2}' `
|
||||||
mkdir $HOME/LD_PATH
|
export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' `
|
||||||
ln -s /opt/rocm-6.4.2/lib/libamdhip* $HOME/LD_PATH
|
|
||||||
|
|
||||||
#Ugly hacks to get down level software working on current system
|
module load cce/21.0.0
|
||||||
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
|
module load cpe/26.03
|
||||||
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
|
module load rocm/7.0.2
|
||||||
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
|
||||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/LD_PATH/
|
export LD_LIBRARY_PATH=/opt/rocm-7.0.2/lib/llvm/lib/:$LD_LIBRARY_PATH
|
||||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-7.0.2/lib
|
|
||||||
|
|||||||
@@ -36,8 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
gridblasHandle_t GridBLAS::gridblasHandle;
|
//gridblasHandle_t GridBLAS::gridblasHandle;
|
||||||
int GridBLAS::gridblasInit;
|
//int GridBLAS::gridblasInit;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Tells little dirac op to use MdagM as the .Op()
|
// Tells little dirac op to use MdagM as the .Op()
|
||||||
|
|||||||
@@ -0,0 +1,373 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: tests/debug/Test_staggered_hdcg.cc
|
||||||
|
|
||||||
|
Authors: Thomas Blum, Peter Boyle
|
||||||
|
|
||||||
|
HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
|
||||||
|
for naive staggered fermions, based on arXiv:2409.03904.
|
||||||
|
|
||||||
|
Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
|
||||||
|
- NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
|
||||||
|
- 4D SchurStaggeredOperator: Mpc = m^2 - D_oe * D_eo (hermitian, positive-definite)
|
||||||
|
- vColourVector fine field type (staggered has colour but no spin)
|
||||||
|
- NextToNearestStencilGeometry4D: 33-point coarse stencil
|
||||||
|
|
||||||
|
Stencil count: D_oe*D_eo has 2-hop fine range. With blocking B >= 2 the coarse
|
||||||
|
shifts have L1-distance <= 2, giving 33 stencil points in 4D:
|
||||||
|
1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
|
||||||
|
NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
|
||||||
|
To extend to ImprovedStaggeredFermion later, use B >= 6.
|
||||||
|
|
||||||
|
Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
|
||||||
|
|
||||||
|
Usage (after build):
|
||||||
|
./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
|
||||||
|
|
||||||
|
*************************************************************************************/
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||||
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
|
||||||
|
|
||||||
|
using namespace Grid;
|
||||||
|
|
||||||
|
// Non-converging CG used as a smoother (fixed number of iterations)
|
||||||
|
template<class Field>
|
||||||
|
class CGSmoother : public LinearFunction<Field>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef LinearOperatorBase<Field> FineOperator;
|
||||||
|
FineOperator &_Op;
|
||||||
|
int iters;
|
||||||
|
CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
|
||||||
|
void operator()(const Field &in, Field &out)
|
||||||
|
{
|
||||||
|
ConjugateGradient<Field> CG(0.0, iters, false);
|
||||||
|
out = Zero();
|
||||||
|
CG(_Op, in, out);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
|
||||||
|
Grid_init(&argc, &argv);
|
||||||
|
fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Parameters — tune for production
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
const int nbasis = 24; // near-null space dimension
|
||||||
|
const int cb = 0; // even checkerboard
|
||||||
|
|
||||||
|
RealD mass = 0.00184;
|
||||||
|
|
||||||
|
// NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
|
||||||
|
// c1 = coefficient of the hopping term (1.0 = standard normalisation).
|
||||||
|
// u0 = tadpole factor (1.0 = no tadpole improvement).
|
||||||
|
RealD c1 = 1.0;
|
||||||
|
RealD u0 = 1.0;
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Grids
|
||||||
|
// Fine: UGrid (4D full), UrbGrid (4D red-black)
|
||||||
|
// Coarse: Coarse4d with dimensions = GridDefaultLatt() / Block
|
||||||
|
//
|
||||||
|
// Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
|
||||||
|
// NaiveStaggeredFermion works with any Block >= {2,2,2,2}
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
|
||||||
|
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
|
||||||
|
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
|
||||||
|
fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
|
||||||
|
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
|
||||||
|
Coordinate Block({4, 4, 4, 4});
|
||||||
|
Coordinate clatt = GridDefaultLatt();
|
||||||
|
for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
|
||||||
|
Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
|
||||||
|
Coordinate cmpi = GridDefaultMpi();
|
||||||
|
fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
|
||||||
|
clatt[0],clatt[1],clatt[2],clatt[3],
|
||||||
|
csimd[0],csimd[1],csimd[2],csimd[3],
|
||||||
|
cmpi[0],cmpi[1],cmpi[2],cmpi[3],
|
||||||
|
(int)vComplex::Nsimd()); fflush(stderr);
|
||||||
|
|
||||||
|
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
|
||||||
|
fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// RNG + gauge field
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1,2,3,4});
|
||||||
|
fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
|
||||||
|
GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
|
||||||
|
fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
int HotStart = 0;
|
||||||
|
if ( HotStart ) {
|
||||||
|
fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
|
||||||
|
SU<Nc>::HotConfiguration(RNG4, Umu);
|
||||||
|
} else {
|
||||||
|
FieldMetaData header;
|
||||||
|
std::string file("./configuration.ildg");
|
||||||
|
IldgReader IR;
|
||||||
|
IR.open(file);
|
||||||
|
IR.readConfiguration(Umu,header);
|
||||||
|
IR.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
|
||||||
|
NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
|
||||||
|
fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
|
||||||
|
SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
|
||||||
|
fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Subspace: inverse-iteration near-null vectors
|
||||||
|
//
|
||||||
|
// CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
|
||||||
|
// converging naturally to the low modes of HermOp without needing spectral
|
||||||
|
// bound tuning. Switch to CreateSubspaceChebyshevNew once the spectrum is
|
||||||
|
// well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
|
||||||
|
Subspace Aggregates(Coarse4d, UrbGrid, cb);
|
||||||
|
|
||||||
|
Aggregates.CreateSubspace(RNGrb, HermOp);
|
||||||
|
Aggregates.Orthogonalise();
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Coarse geometry: NextToNearestStencilGeometry4D
|
||||||
|
// hops=2 -> 33 stencil points in 4D
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
NextToNearestStencilGeometry4D geom(Coarse4d);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Single-RHS coarse operator (used for correctness check below)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
|
||||||
|
typedef LittleDiracOp::CoarseVector CoarseVector;
|
||||||
|
|
||||||
|
LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
|
||||||
|
LDO.CoarsenOperator(HermOp, Aggregates);
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Correctness check: P M_fine P^T c ≈ M_coarse c
|
||||||
|
//
|
||||||
|
// Promote a random coarse vector into the fine subspace, apply the
|
||||||
|
// fine operator, project back, and compare with the coarse operator
|
||||||
|
// applied directly. Error should be at the level of subspace
|
||||||
|
// approximation quality (smaller = better basis vectors).
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
{
|
||||||
|
GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
|
||||||
|
CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
|
||||||
|
random(RNGc, c_src);
|
||||||
|
|
||||||
|
LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
|
||||||
|
Aggregates.PromoteFromSubspace(c_src, f_v);
|
||||||
|
HermOp.Op(f_v, f_Mv);
|
||||||
|
Aggregates.ProjectToSubspace(c_proj, f_Mv);
|
||||||
|
|
||||||
|
LDO.M(c_src, c_ldop);
|
||||||
|
|
||||||
|
c_proj -= c_ldop;
|
||||||
|
RealD err = norm2(c_proj) / norm2(c_ldop);
|
||||||
|
std::cout << GridLogMessage
|
||||||
|
<< "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Multi-RHS coarse grid
|
||||||
|
//
|
||||||
|
// The extra leading dimension holds nrhs right-hand sides packed into
|
||||||
|
// SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
const int nrhs = vComplex::Nsimd() * 2;
|
||||||
|
|
||||||
|
Coordinate mpi = GridDefaultMpi();
|
||||||
|
Coordinate rhMpi ({1, mpi[0], mpi[1], mpi[2], mpi[3]});
|
||||||
|
Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
|
||||||
|
Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
|
||||||
|
|
||||||
|
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
|
||||||
|
|
||||||
|
typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
|
||||||
|
MultiCoarseOp mrhs(geom, CoarseMrhs);
|
||||||
|
mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Coarse-grid Lanczos for deflation
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
|
||||||
|
MrhsHermOp MrhsCoarseOp(mrhs);
|
||||||
|
|
||||||
|
// Estimate spectral bounds for Lanczos Chebyshev filter
|
||||||
|
CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
|
||||||
|
PowerMethod<CoarseVector> cPM;
|
||||||
|
RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
|
||||||
|
// Chebyshev filter window [lo, hi]:
|
||||||
|
// lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
|
||||||
|
// coarse eigenvalues so that only the target modes receive cosh amplification.
|
||||||
|
//
|
||||||
|
// From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
|
||||||
|
// Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746] ~= mass^2
|
||||||
|
// Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
|
||||||
|
// Group 2 (second group): lambda in [0.165, 0.179]
|
||||||
|
//
|
||||||
|
// lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
|
||||||
|
// factor 8x below lambda_24=0.165).
|
||||||
|
// hi = lambda_max_coarse * 1.1 ~= 2.121
|
||||||
|
// y(lambda_0=0.002647) ~ -1.016 -> T_70 ~ 1.7e5 (cosh(70*0.182))
|
||||||
|
// y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
|
||||||
|
// Relative spread across near-null cluster: ~4.3%
|
||||||
|
// y(lambda_24=0.165) ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
|
||||||
|
//
|
||||||
|
// order=71 (degree 70) is needed to give ~4% relative spread across the
|
||||||
|
// near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
|
||||||
|
// gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
|
||||||
|
// Absolute amplification ~1e5; what matters for IRL convergence is the
|
||||||
|
// relative spread, not the absolute value.
|
||||||
|
// lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
|
||||||
|
// lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
|
||||||
|
RealD lambda_lo = 0.02;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
|
||||||
|
<< " hi=" << lambda_max*1.1 << " order=71" << std::endl;
|
||||||
|
|
||||||
|
Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
|
||||||
|
|
||||||
|
// 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
|
||||||
|
// in the first Lanczos restart. The remaining modes (~0.165) are a
|
||||||
|
// second spectral group that needs more Krylov vectors; handle them
|
||||||
|
// separately once the basic HDCG solve is validated.
|
||||||
|
int Nk = 24;
|
||||||
|
int Nm = 48;
|
||||||
|
int Nstop = Nk;
|
||||||
|
|
||||||
|
GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
|
||||||
|
|
||||||
|
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
|
||||||
|
IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
|
||||||
|
Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
|
||||||
|
|
||||||
|
int Nconv;
|
||||||
|
std::vector<RealD> eval(Nm);
|
||||||
|
std::vector<CoarseVector> evec(Nm, Coarse4d); // evec on f_grid (single-RHS coarse)
|
||||||
|
std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
|
||||||
|
for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
|
||||||
|
IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// HDCG solver assembly
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
MultiRHSDeflation<CoarseVector> MrhsGuesser;
|
||||||
|
MrhsGuesser.ImportEigenBasis(evec, eval);
|
||||||
|
|
||||||
|
// MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
|
||||||
|
MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
|
||||||
|
MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
|
||||||
|
MrhsProjector.ImportBasis(Aggregates.subspace);
|
||||||
|
|
||||||
|
ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
|
||||||
|
DoNothingGuesser<CoarseVector> DoNothing;
|
||||||
|
HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
|
||||||
|
|
||||||
|
// Spectral radius of the fine operator, needed for the smoother shift.
|
||||||
|
// Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
|
||||||
|
LatticeStaggeredFermionD fine_pm_src(UrbGrid);
|
||||||
|
random(RNGrb, fine_pm_src);
|
||||||
|
PowerMethod<LatticeStaggeredFermionD> finePM;
|
||||||
|
RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
|
||||||
|
|
||||||
|
// Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 200.
|
||||||
|
//
|
||||||
|
// The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the
|
||||||
|
// interval [shift, lambda_max + shift] ~ [0.023, 4.62] (for lambda_max ~ 4.6), so the polynomial
|
||||||
|
// focuses entirely on the HIGH-frequency part of the spectrum and leaves
|
||||||
|
// near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
|
||||||
|
//
|
||||||
|
// This is the right target because the coarse-grid correction always introduces
|
||||||
|
// high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
|
||||||
|
// are piecewise constant across coarse cells and therefore have sharp edges at
|
||||||
|
// cell boundaries (like lego-block edges). Smoothness is measured by the
|
||||||
|
// covariant Dirac derivative, so promoting the coarse solution back to the
|
||||||
|
// fine grid inevitably excites high-frequency components — just as a step
|
||||||
|
// function always carries high-frequency Fourier content. The smoother must
|
||||||
|
// repair exactly these high modes.
|
||||||
|
//
|
||||||
|
// The smoother and the coarse-grid correction are applied alternately: together
|
||||||
|
// they both lift the low eigenvalues and pull down the upper eigenvalues of the
|
||||||
|
// composite preconditioned operator, reducing the condition number seen by the
|
||||||
|
// outer HDCG iterations.
|
||||||
|
//
|
||||||
|
// DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
|
||||||
|
// the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
|
||||||
|
RealD smootherShift = fine_lambda_max / 200.0;
|
||||||
|
std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
|
||||||
|
<< fine_lambda_max << "/200 = " << smootherShift << std::endl;
|
||||||
|
ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
|
||||||
|
CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
|
||||||
|
|
||||||
|
TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
|
||||||
|
HDCG(1.0e-8, 500,
|
||||||
|
HermOp,
|
||||||
|
smoother,
|
||||||
|
HPDSolve, // M1 (coarse correction)
|
||||||
|
HPDSolve, // Vstart (initial guess projection)
|
||||||
|
MrhsProjector,
|
||||||
|
MrhsGuesser,
|
||||||
|
CoarseMrhs);
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Solve: nrhs right-hand sides simultaneously
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
|
||||||
|
std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
|
||||||
|
|
||||||
|
GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
|
||||||
|
for (int r = 0; r < nrhs; r++) {
|
||||||
|
random(RNGrb2, src[r]);
|
||||||
|
sol[r] = Zero();
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Baseline: standard single-RHS CG on HermOp (no preconditioning)
|
||||||
|
// Run before HDCG to establish the unpreconditioned iteration count
|
||||||
|
// and wall-clock time for direct comparison.
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
{
|
||||||
|
ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 100000, false);
|
||||||
|
std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
|
||||||
|
for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
|
||||||
|
|
||||||
|
RealD t0 = usecond();
|
||||||
|
int total_iters = 0;
|
||||||
|
for (int r = 0; r < nrhs; r++) {
|
||||||
|
std::cout << GridLogMessage << "====== CG baseline RHS " << r
|
||||||
|
<< " ======" << std::endl;
|
||||||
|
CG(HermOp, src[r], cg_sol[r]);
|
||||||
|
total_iters += CG.IterationsToComplete;
|
||||||
|
}
|
||||||
|
RealD t1 = usecond();
|
||||||
|
std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
|
||||||
|
<< total_iters << " total iterations, "
|
||||||
|
<< (t1 - t0) / 1.0e6 << " s total, "
|
||||||
|
<< (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// HDCG solve
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
HDCG(src, sol);
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user