1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

Tracing replaces self timing

This commit is contained in:
Peter Boyle 2022-08-31 17:19:02 -04:00
parent fd33c835dd
commit bb049847d5

View File

@ -76,91 +76,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
StencilOdd.BuildSurfaceList(1,vol4);
}
template<class Impl>
void WilsonFermion<Impl>::Report(void)
{
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
RealD volume = 1;
Coordinate latt = _grid->GlobalDimensions();
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
}
if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
// how to count flops here?
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call ? : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node ? : " << mflops/NP << std::endl;
// how to count flops here?
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) ? : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl; }
if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
if ( DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
}
}
template<class Impl>
void WilsonFermion<Impl>::ZeroCounters(void) {
DhopCalls = 0; // ok
DhopCommTime = 0;
DhopComputeTime = 0;
DhopComputeTime2= 0;
DhopFaceTime = 0;
DhopTotalTime = 0;
DerivCalls = 0; // ok
DerivCommTime = 0;
DerivComputeTime = 0;
DerivDhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
Stencil.ZeroCountersi();
StencilEven.ZeroCountersi();
StencilOdd.ZeroCountersi();
}
template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
@ -320,7 +235,6 @@ template <class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField &mat, const FermionField &A,
const FermionField &B, int dag) {
DerivCalls++;
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
@ -329,11 +243,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
FermionField Atilde(B.Grid());
Atilde = A;
DerivCommTime-=usecond();
st.HaloExchange(B, compressor);
DerivCommTime+=usecond();
DerivComputeTime-=usecond();
for (int mu = 0; mu < Nd; mu++) {
////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag
@ -341,7 +252,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
int gamma = mu;
if (!dag) gamma += Nd;
DerivDhopComputeTime -= usecond();
int Ls=1;
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@ -349,9 +259,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
// spin trace outer product
//////////////////////////////////////////////////
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
DerivDhopComputeTime += usecond();
}
DerivComputeTime += usecond();
}
template <class Impl>
@ -398,7 +306,6 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
@ -410,7 +317,6 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@ -423,7 +329,6 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@ -488,14 +393,12 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
const FermionField &in,
FermionField &out, int dag)
{
DhopTotalTime-=usecond();
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
#endif
DhopInternalSerial(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
}
template <class Impl>
@ -504,6 +407,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
const FermionField &in,
FermionField &out, int dag)
{
GRID_TRACE("DhopOverlapped");
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
@ -514,53 +418,55 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
/////////////////////////////
std::vector<std::vector<CommsRequest_t> > requests;
st.Prepare();
DhopFaceTime-=usecond();
st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
{
GRID_TRACE("Gather");
st.HaloGather(in,compressor);
}
DhopCommTime -=usecond();
tracePush("Communication");
st.CommunicateBegin(requests);
/////////////////////////////
// Overlap with comms
/////////////////////////////
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+=usecond();
{
GRID_TRACE("MergeSHM");
st.CommsMergeSHM(compressor);
}
/////////////////////////////
// do the compute interior
/////////////////////////////
int Opt = WilsonKernelsStatic::Opt;
DhopComputeTime-=usecond();
if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
} else {
GRID_TRACE("DhopInterior");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
}
DhopComputeTime+=usecond();
/////////////////////////////
// Complete comms
/////////////////////////////
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
tracePop("Communication");
{
GRID_TRACE("Merge");
st.CommsMerge(compressor);
}
/////////////////////////////
// do the compute exterior
/////////////////////////////
DhopComputeTime2-=usecond();
if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
} else {
GRID_TRACE("DhopExterior");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
}
DhopComputeTime2+=usecond();
};
@ -570,20 +476,22 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
const FermionField &in,
FermionField &out, int dag)
{
GRID_TRACE("DhopSerial");
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
DhopCommTime-=usecond();
st.HaloExchange(in, compressor);
DhopCommTime+=usecond();
{
GRID_TRACE("HaloExchange");
st.HaloExchange(in, compressor);
}
DhopComputeTime-=usecond();
int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) {
GRID_TRACE("DhopDag");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
} else {
GRID_TRACE("Dhop");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
}
DhopComputeTime+=usecond();
};
/*Change ends */