diff --git a/Grid/qcd/utils/CovariantLaplacian.h b/Grid/qcd/utils/CovariantLaplacian.h
index d6323ed2..9e1204d4 100644
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@@ -109,8 +109,7 @@ public:
   };
   virtual GridBase *Grid(void) { return grid; };
-
-  virtual void M (const Field &_in, Field &_out)
+  virtual void Morig(const Field &_in, Field &_out)
   {
     ///////////////////////////////////////////////
     // Halo exchange for this geometry of stencil
     ///////////////////////////////////////////////
@@ -120,7 +119,8 @@ public:
     ///////////////////////////////////
     // Arithmetic expressions
     ///////////////////////////////////
-    auto st = Stencil.View(AcceleratorRead);
+//  auto st = Stencil.View(AcceleratorRead);
+    autoView( st , Stencil , AcceleratorRead);
     auto buf = st.CommBuf();
 
     autoView( in , _in , AcceleratorRead);
@@ -172,7 +172,165 @@ public:
       coalescedWrite(out[ss], res,lane);
     });
+  };
+  virtual void Mnew (const Field &_in, Field &_out)
+  {
+    ///////////////////////////////////////////////
+    // Halo exchange for this geometry of stencil
+    ///////////////////////////////////////////////
+//  Stencil.HaloExchange(_in, Compressor);
+    std::vector<std::vector<CommsRequest_t> > requests;
+    Stencil.Prepare();
+    {
+      GRID_TRACE("Laplace Gather");
+      Stencil.HaloGather(_in,Compressor);
+    }
+
+    tracePush("Laplace Communication");
+    Stencil.CommunicateBegin(requests);
+    {
+      GRID_TRACE("MergeSHM");
+      Stencil.CommsMergeSHM(Compressor);
+    }
+
+
+    ///////////////////////////////////
+    // Arithmetic expressions
+    ///////////////////////////////////
+//  auto st = Stencil.View(AcceleratorRead);
+    autoView( st , Stencil , AcceleratorRead);
+    auto buf = st.CommBuf();
+
+    autoView( in , _in , AcceleratorRead);
+    autoView( out , _out , AcceleratorWrite);
+    autoView( U , Uds , AcceleratorRead);
+
+    typedef typename Field::vector_object vobj;
+    typedef decltype(coalescedRead(in[0])) calcObj;
+    typedef decltype(coalescedRead(U[0](0))) calcLink;
+
+    const int Nsimd = vobj::Nsimd();
+    const uint64_t NN = grid->oSites();
+
+    accelerator_for( ss, NN, Nsimd, {
+
+      StencilEntry *SE;
+
+      const int lane=acceleratorSIMTlane(Nsimd);
+
+      calcObj chi;
+      calcObj res;
+      calcObj Uchi;
+      calcObj Utmp;
+      calcObj Utmp2;
+      calcLink UU;
+      calcLink Udag;
+      int ptype;
+
+      res = coalescedRead(in[ss])*(-8.0);
+
+
+      SE = st.GetEntry(ptype, 0, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(0,Xp);
+      }
+      SE = st.GetEntry(ptype, 1, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(1,Yp);
+      }
+      SE = st.GetEntry(ptype, 2, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(2,Zp);
+      }
+      SE = st.GetEntry(ptype, 3, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(3,Tp);
+      }
+      SE = st.GetEntry(ptype, 4, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(4,Xm);
+      }
+      SE = st.GetEntry(ptype, 5, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(5,Ym);
+      }
+      SE = st.GetEntry(ptype, 6, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(6,Zm);
+      }
+      SE = st.GetEntry(ptype, 7, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(7,Tm);
+      }
+
+      coalescedWrite(out[ss], res,lane);
+    });
+
+    Stencil.CommunicateComplete(requests);
+    tracePop("Communication");
+
+    {
+      GRID_TRACE("Merge");
+      Stencil.CommsMerge(Compressor);
+    }
+
+
+    accelerator_for( ss, NN, Nsimd, {
+
+      StencilEntry *SE;
+
+      const int lane=acceleratorSIMTlane(Nsimd);
+
+      calcObj chi;
+      calcObj res;
+      calcObj Uchi;
+      calcObj Utmp;
+      calcObj Utmp2;
+      calcLink UU;
+      calcLink Udag;
+      int ptype;
+
+//    res = coalescedRead(in[ss])*(-8.0);
+      res = coalescedRead(out[ss]);
+
+      SE = st.GetEntry(ptype, 0, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(0,Xp);
+      }
+      SE = st.GetEntry(ptype, 1, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(1,Yp);
+      }
+      SE = st.GetEntry(ptype, 2, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(2,Zp);
+      }
+      SE = st.GetEntry(ptype, 3, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(3,Tp);
+      }
+      SE = st.GetEntry(ptype, 4, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(4,Xm);
+      }
+      SE = st.GetEntry(ptype, 5, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(5,Ym);
+      }
+      SE = st.GetEntry(ptype, 6, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(6,Zm);
+      }
+      SE = st.GetEntry(ptype, 7, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(7,Tm);
+      }
+
+      coalescedWrite(out[ss], res,lane);
+    });
+  };
+  virtual void M(const Field &in, Field &out) {Mnew(in,out);};
   virtual void Mdag (const Field &in, Field &out) { M(in,out);};   // Laplacian is hermitian
   virtual void Mdiag (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
   virtual void Mdir  (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
diff --git a/Grid/qcd/utils/CovariantLaplacianRat.h b/Grid/qcd/utils/CovariantLaplacianRat.h
index d0eae17a..6cd2d927 100644
--- a/Grid/qcd/utils/CovariantLaplacianRat.h
+++ b/Grid/qcd/utils/CovariantLaplacianRat.h
@@ -189,12 +189,12 @@ public:
     for(int i =0;i,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
-  #if USE_CHRONO
+#if USE_CHRONO
       MinvMom[i] = Forecast(QuadOp, right_nu, prev_solns[nu]);
-  #endif
-  #ifndef MIXED_CG
+#endif
+#ifndef MIXED_CG
       CG(QuadOp,right_nu,MinvMom[i]);
-  #else
+#else
       QuadLinearOperator,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
 //   QuadLinearOperator,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],par.b1[i],par.b2);
       MixedPrecisionConjugateGradient MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp);
@@ -227,9 +227,9 @@ public:
 //   Laplacian.M(MinvGMom, LMinvGMom);
       MixedCG(right_nu,MinvMom[i]);
 #endif
-  #if USE_CHRONO
+#if USE_CHRONO
       prev_solns[nu].push_back(MinvGMom);
-  #endif
+#endif
      LapStencil.M(MinvMom[i], Gtemp2);
      LMinvMom=fac*Gtemp2;
      AMinvMom = par.a1[i]*LMinvMom;
diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 5591fc69..7413a5b8 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -641,6 +641,342 @@ public:
   std::cout< mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+
+    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
+                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                             GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+
+    ///////// Welcome message ////////////
+    std::cout< seeds4({1,2,3,4});
+    GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+    typedef ImprovedStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+
+    typedef typename PeriodicGimplF::Field GaugeFieldF;
+    typedef typename PeriodicGimplF::LinkField GaugeLinkFieldF;
+
+    Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
+
+    typename Action::ImplParams params;
+    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+      CovariantAdjointLaplacianStencil LapStencilF(FGrid);
+      QuadLinearOperator,GaugeLinkFieldF> QuadOpF(LapStencilF,c2,c1,1.);
+
+      const int num_cases = 4;
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+
+      controls Cases [] = {
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential }
+      };
+
+      for(int c=0;c(Umu, 0);
+
+        SU<Nc>::HotConfiguration(RNG4,Umu);
+        LapStencilF.GaugeImport(Umu);
+
+        GaugeLinkFieldF r_l(FGrid);
+
+        std::cout<Barrier();
+        for(int i=0;i CG(1e-7,10000,false);
+
+        FGrid->Barrier();
+        double t1=usecond();
+        uint64_t ncall = 500;
+
+        FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall);
+        for(uint64_t i=0;iBarrier();
+
+        double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops;
+        if ( mflops mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+
+    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
+                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                             GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+
+    ///////// Welcome message ////////////
+    std::cout< seeds4({1,2,3,4});
+    GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+//  typedef ImprovedStaggeredFermionF Action;
+//  typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+
+    Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
+
+//  typename Action::ImplParams params;
+//  Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
+
+//  PeriodicGimplF
+    typedef typename PeriodicGimplF::LinkField GaugeLinkFieldF;
+
+    ///////// Source preparation ////////////
+    GaugeLinkFieldF src  (FGrid); random(RNG4,src);
+//  GaugeLinkFieldF src_e (FrbGrid);
+//  GaugeLinkFieldF src_o (FrbGrid);
+//  GaugeLinkFieldF r_e (FrbGrid);
+//  GaugeLinkFieldF r_o (FrbGrid);
+    GaugeLinkFieldF r_eo (FGrid);
+
+    {
+
+      // pickCheckerboard(Even,src_e,src);
+      // pickCheckerboard(Odd,src_o,src);
+
+      const int num_cases = 1;
+      std::string fmt("G/O/C ");
+
+      controls Cases [] = {
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+      };
+
+      for(int c=0;c LapStencilF(FGrid);
+        QuadLinearOperator,PeriodicGimplF::LinkField> QuadOpF(LapStencilF,c2,c1,1.);
+        LapStencilF.GaugeImport(Umu);
+
+
+        StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
+        StaggeredKernelsStatic::Opt   = Cases[c].Opt;
+        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+        std::cout<Barrier();
+        for(int i=0;iBarrier();
+        double t1=usecond();
+        uint64_t ncall = 500;
+
+        FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall);
+        for(uint64_t i=0;iBarrier();
+
+        double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops;
+        if ( mflopsBarrier();
+
+      }
+
+      std::cout<Barrier();
+
+      for(int i=0;i L_list({8,12,16,24,32});
+  std::vector<int> L_list({16,24,32});
   int selm1=sel-1;
 
   std::vector<double> wilson;
   std::vector<double> dwf4;
   std::vector<double> staggered;
+  std::vector<double> lap;
 
   int Ls=1;
   std::cout<
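
Note (editor's sketch, not part of the patch): Mnew replaces the blocking Stencil.HaloExchange used by Morig with an explicit gather / begin-comms / interior-compute / complete-comms / exterior-compute sequence, so the legs of the eight-point stencil whose neighbours are local (SE->_is_local) are evaluated while the halo is still in flight, and the off-node legs are accumulated onto the partial result after CommunicateComplete. The outline below is a minimal sketch of that pattern only; the stencil calls (Prepare, HaloGather, CommunicateBegin, CommsMergeSHM, CommunicateComplete, CommsMerge) mirror the ones in the diff, while overlappedApply, interior and exterior are illustrative placeholder names, not Grid API.

// Sketch of the comms/compute overlap skeleton that Mnew follows.
// Assumes a Grid build (CommsRequest_t, CartesianStencil, compressor types);
// the two kernel arguments stand for the two accelerator_for passes above.
#include <vector>

template<class Field, class Stencil, class Compressor,
         class InteriorKernel, class ExteriorKernel>
void overlappedApply(Stencil &St, Compressor &Cmp,
                     const Field &in, Field &out,
                     InteriorKernel interior, ExteriorKernel exterior)
{
  std::vector<std::vector<CommsRequest_t> > requests;

  St.Prepare();
  St.HaloGather(in, Cmp);            // pack halo faces for this stencil geometry
  St.CommunicateBegin(requests);     // start asynchronous transfers
  St.CommsMergeSHM(Cmp);             // resolve intra-node (shared-memory) faces

  interior(in, out);                 // first pass: legs with SE->_is_local set,
                                     // overlapped with the communication above

  St.CommunicateComplete(requests);  // wait for remote halo data
  St.CommsMerge(Cmp);                // unpack the received faces

  exterior(in, out);                 // second pass: legs with SE->_is_local==0,
                                     // accumulated onto the partial result in out
}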