diff --git a/Grid/log/Log.cc b/Grid/log/Log.cc index b54c368d..166aea0a 100644 --- a/Grid/log/Log.cc +++ b/Grid/log/Log.cc @@ -79,6 +79,7 @@ void GridLogConfigure(std::vector &logstreams) { GridLogWarning.Active(0); GridLogMessage.Active(1); // at least the messages should be always on GridLogMemory.Active(0); + GridLogTracing.Active(0); GridLogIterative.Active(0); GridLogDebug.Active(0); GridLogPerformance.Active(0); diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index 5027fcf4..1e8d6d7a 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -55,14 +55,18 @@ public: deriv_num=0; } void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) { - deriv_max_sum+=max; + if ( max > deriv_max_sum ) { + deriv_max_sum=max; + } deriv_norm_sum+=nrm; - Fdt_max_sum+=Fdt_max; + if ( Fdt_max > Fdt_max_sum ) { + Fdt_max_sum=Fdt_max; + } Fdt_norm_sum+=Fdt_nrm; deriv_num++; } - RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; }; + RealD deriv_max_average(void) { return deriv_max_sum; }; RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; }; - RealD Fdt_max_average(void) { return Fdt_max_sum/deriv_num; }; + RealD Fdt_max_average(void) { return Fdt_max_sum; }; RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; }; RealD deriv_timer(void) { return deriv_us; }; RealD S_timer(void) { return S_us; }; diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 89aeca61..f62b3aba 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -36,10 +36,12 @@ NAMESPACE_BEGIN(Grid); // Wilson compressor will need FaceGather policies for: // Periodic, Dirichlet, and partial Dirichlet for DWF /////////////////////////////////////////////////////////////// +const int dwf_compressor_depth=2; class FaceGatherPartialDWF { public: - static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/2;}; + // static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; + static int PartialCompressionFactor(GridBase *grid) {return 1;} // static int PartialCompressionFactor(GridBase *grid) { return 1;} template static void Gather_plane_simple (commVector >& table, @@ -52,14 +54,17 @@ public: // Shrinks local and remote comms buffers GridBase *Grid = rhs.Grid(); int Ls = Grid->_rdimensions[0]; + // int depth=dwf_compressor_depth; + int depth=Ls/2; std::pair *table_v = & table[0]; auto rhs_v = rhs.View(AcceleratorRead); int vol=table.size()/Ls; accelerator_forNB( idx,table.size(), vobj::Nsimd(), { Integer i=idx/Ls; Integer s=idx%Ls; - if(s==0) compress.Compress(buffer[off+i ],rhs_v[so+table_v[idx].second]); - if(s==Ls-1) compress.Compress(buffer[off+i+vol],rhs_v[so+table_v[idx].second]); + Integer sc=depth+s-(Ls-depth); + if(s=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]); }); rhs_v.ViewClose(); } @@ -67,6 +72,8 @@ public: static void DecompressFace(decompressor decompress,Decompression &dd) { auto Ls = dd.dims[0]; + // int depth=dwf_compressor_depth; + int depth=Ls/2; // Just pass in the Grid auto kp = dd.kernel_p; auto mp = dd.mpi_p; @@ -75,11 +82,12 @@ public: accelerator_forNB(o,size,1,{ int idx=o/Ls; int s=o%Ls; - if ( s == 0 ) { - int oo=idx; + if ( s < depth ) { + int oo=s*vol+idx; kp[o]=mp[oo]; - } else if ( s == Ls-1 ) { - int oo=vol+idx; + } else if ( s >= Ls-depth ) { + int sc = depth + s - (Ls-depth); + int oo=sc*vol+idx; kp[o]=mp[oo]; } else { kp[o] = Zero();//fill rest with zero if partial dirichlet @@ -98,7 +106,9 @@ public: { GridBase *Grid = rhs.Grid(); int Ls = Grid->_rdimensions[0]; - + // int depth=dwf_compressor_depth; + int depth = Ls/2; + // insertion of zeroes... assert( (table.size()&0x1)==0); int num=table.size()/2; @@ -113,7 +123,7 @@ public: // Reorders both local and remote comms buffers // int s = j % Ls; - int sp1 = (s+1)%Ls; // peri incremented s slice + int sp1 = (s+depth)%Ls; // peri incremented s slice int hxyz= j/Ls; @@ -136,6 +146,8 @@ public: static void MergeFace(decompressor decompress,Merger &mm) { auto Ls = mm.dims[0]; + int depth = Ls/2; + // int depth=dwf_compressor_depth; int num= mm.buffer_size/2; // relate vol and Ls to buffer size auto mp = &mm.mpointer[0]; auto vp0= &mm.vpointers[0][0]; // First arg is exchange first @@ -149,7 +161,7 @@ public: int xyz0=hxyz*2; int xyz1=hxyz*2+1; - int sp = (s+1)%Ls; + int sp = (s+depth)%Ls; int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... int oo0= s+xyz0*Ls; @@ -163,7 +175,8 @@ public: class FaceGatherDWFMixedBCs { public: - static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/2;}; + // static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; + static int PartialCompressionFactor(GridBase *grid) {return 1;} template static void Gather_plane_simple (commVector >& table, diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h index ff605362..cb680f2f 100644 --- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h @@ -279,6 +279,7 @@ NAMESPACE_BEGIN(Grid); // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi ////////////////////////////////////////////////////// virtual RealD Sinitial(const GaugeField &U) { + std::cout << GridLogMessage << "Returning stored two flavour refresh action "< { public: INHERIT_IMPL_TYPES(Impl); - + private: FermionOperator & NumOp;// the basic operator FermionOperator & DenOp;// the basic operator @@ -112,28 +112,48 @@ NAMESPACE_BEGIN(Grid); // NumOp == V // DenOp == M // + AUDIT(); FermionField etaOdd (NumOp.FermionRedBlackGrid()); FermionField etaEven(NumOp.FermionRedBlackGrid()); FermionField tmp (NumOp.FermionRedBlackGrid()); + AUDIT(); pickCheckerboard(Even,etaEven,eta); + AUDIT(); pickCheckerboard(Odd,etaOdd,eta); + AUDIT(); NumOp.ImportGauge(U); + AUDIT(); DenOp.ImportGauge(U); + std::cout << " TwoFlavourRefresh: Imported gauge "< Mpc(DenOp); + AUDIT(); SchurDifferentiableOperator Vpc(NumOp); + AUDIT(); + std::cout << " TwoFlavourRefresh: Diff ops "<is_smeared); double start_force = usecond(); + + std::cout << GridLogMessage << "AuditForce["<deriv_timer_start(); as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta as[level].actions.at(a)->deriv_timer_stop(); + std::cout << GridLogMessage << "AuditForce["<is_smeared << std::endl; auto name = as[level].actions.at(a)->action_name(); if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); @@ -284,7 +291,7 @@ public: for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { std::cout << GridLogMessage << as[level].actions.at(actionID)->action_name() - <<"["<deriv_max_average() <<" norm " << as[level].actions.at(actionID)->deriv_norm_average() <<" Fdt max " << as[level].actions.at(actionID)->Fdt_max_average() @@ -364,9 +371,14 @@ public: std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<is_smeared); + + std::cout << GridLogMessage << "AuditRefresh["<refresh_timer_start(); as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); as[level].actions.at(actionID)->refresh_timer_stop(); + std::cout << GridLogMessage << "AuditRefresh["<is_smeared); @@ -412,6 +425,7 @@ public: as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; + AUDIT(); } as[level].apply(S_hireps, Representations, level, H); } @@ -424,7 +438,9 @@ public: void operator()(std::vector*> repr_set, Repr& Rep, int level, RealD& H) { for (int a = 0; a < repr_set.size(); ++a) { + AUDIT(); RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); + AUDIT(); std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; H += Hterm; @@ -449,8 +465,10 @@ public: Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; as[level].actions.at(actionID)->S_timer_start(); + AUDIT(); Hterm = as[level].actions.at(actionID)->Sinitial(Us); as[level].actions.at(actionID)->S_timer_stop(); + AUDIT(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; } @@ -463,6 +481,7 @@ public: void integrate(Field& U) { + AUDIT(); // reset the clocks t_U = 0; for (int level = 0; level < as.size(); ++level) { @@ -480,8 +499,10 @@ public: assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } + AUDIT(); FieldImplementation::Project(U); + AUDIT(); // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc index f4ca8515..e85bec7e 100644 --- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc +++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc @@ -179,8 +179,11 @@ int main(int argc, char **argv) { MD.name = std::string("Force Gradient"); //typedef GenericHMCRunner HMCWrapper; // MD.name = std::string("MinimumNorm2"); - // MD.MDsteps = 4; - MD.MDsteps = 4; + // TrajL = 2 + // 4/2 => 0.6 dH + // 3/3 => ?? dH + //MD.MDsteps = 4; + MD.MDsteps = 3; MD.trajL = 0.5; HMCparameters HMCparams; @@ -223,7 +226,7 @@ int main(int argc, char **argv) { Real light_mass = 7.8e-4; Real strange_mass = 0.0362; Real pv_mass = 1.0; - std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); @@ -327,6 +330,8 @@ int main(int argc, char **argv) { ParamsF.dirichlet=NonDirichlet; ParamsDir.dirichlet=Dirichlet; ParamsDirF.dirichlet=Dirichlet; + ParamsDir.partialDirichlet=1; + ParamsDirF.partialDirichlet=1; // double StoppingCondition = 1e-14; // double MDStoppingCondition = 1e-9; @@ -342,8 +347,8 @@ int main(int argc, char **argv) { // Collect actions //////////////////////////////////// ActionLevel Level1(1); - ActionLevel Level2(2); - ActionLevel Level3(30); + ActionLevel Level2(3); + ActionLevel Level3(15); //////////////////////////////////// // Strange action @@ -474,13 +479,21 @@ int main(int argc, char **argv) { if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; else ParamsDen.dirichlet = NonDirichlet; + if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; + else ParamsNum.partialDirichlet = 0; + + if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; + else ParamsDen.partialDirichlet = 0; + Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); ParamsDenF.dirichlet = ParamsDen.dirichlet; + ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet; DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF)); ParamsNumF.dirichlet = ParamsNum.dirichlet; + ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet; NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF)); LinOpD.push_back(new LinearOperatorD(*Denominators[h])); @@ -516,9 +529,11 @@ int main(int argc, char **argv) { FermionActionD2::ImplParams ParamsNumD2(boundary); ParamsDenD2.dirichlet = ParamsDen.dirichlet; + ParamsDenD2.partialDirichlet = ParamsDen.partialDirichlet; DenominatorsD2.push_back(new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenD2)); ParamsNumD2.dirichlet = ParamsNum.dirichlet; + ParamsNumD2.partialDirichlet = ParamsNum.partialDirichlet; NumeratorsD2.push_back (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2)); Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction( @@ -543,7 +558,8 @@ int main(int argc, char **argv) { int nquo=Quotients.size(); Level1.push_back(Bdys[0]); Level1.push_back(Bdys[1]); - for(int h=0;h HIP and SYCL GPU code + + ====== DDHMC ====== diff --git a/benchmarks/Benchmark_dwf_fp32_partial.cc b/benchmarks/Benchmark_dwf_fp32_partial.cc index 4db3022a..0cbd4a6c 100644 --- a/benchmarks/Benchmark_dwf_fp32_partial.cc +++ b/benchmarks/Benchmark_dwf_fp32_partial.cc @@ -88,6 +88,7 @@ int main (int argc, char ** argv) // Node level ////////////////////// for(int d=0;d1 ? 1 : 0; + // for(int d=0;doSites();ss++){ for(int s=0;s=Ls-depth)){ tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s]; } else { tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s]; @@ -246,7 +247,7 @@ void Benchmark(int Ls, Coordinate Dirichlet, int partial) autoView( src_v, src , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;s=Ls-depth)){ tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s]; } else { tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s]; @@ -342,6 +343,7 @@ void Benchmark(int Ls, Coordinate Dirichlet, int partial) ref = Zero(); for(int mu=0;muoSites();ss++){ for(int s=0;s=Ls-depth)){ tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s]; } else { tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s]; @@ -365,7 +367,7 @@ void Benchmark(int Ls, Coordinate Dirichlet, int partial) autoView( src_v, src , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;s=Ls-depth)){ tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s]; } else { tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s]; diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index bd825ab3..3965767f 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -3,6 +3,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` --with-lime=$CLIME \ --enable-unified=no \ --enable-shm=nvlink \ +--enable-tracing=timer \ --enable-accelerator=hip \ --enable-gen-simd-width=64 \ --enable-simd=GPU \