diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index c958019d..2833fdc4 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -76,91 +76,6 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, StencilOdd.BuildSurfaceList(1,vol4); } -template -void WilsonFermion::Report(void) -{ - RealD NP = _grid->_Nprocessors; - RealD NN = _grid->NodeCount(); - RealD volume = 1; - Coordinate latt = _grid->GlobalDimensions(); - for(int mu=0;mu 0 ) { - std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl; - std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; - - // Average the compute time - _grid->GlobalSum(DhopComputeTime); - DhopComputeTime/=NP; - RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; - - RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; - - } - - if ( DerivCalls > 0 ) { - std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; - std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " < 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion Stencil" < 0){ - std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" < -void WilsonFermion::ZeroCounters(void) { - DhopCalls = 0; // ok - DhopCommTime = 0; - DhopComputeTime = 0; - DhopComputeTime2= 0; - DhopFaceTime = 0; - DhopTotalTime = 0; - - DerivCalls = 0; // ok - DerivCommTime = 0; - DerivComputeTime = 0; - DerivDhopComputeTime = 0; - - Stencil.ZeroCounters(); - StencilEven.ZeroCounters(); - StencilOdd.ZeroCounters(); - Stencil.ZeroCountersi(); - StencilEven.ZeroCountersi(); - StencilOdd.ZeroCountersi(); -} - - template void WilsonFermion::ImportGauge(const GaugeField &_Umu) { @@ -320,7 +235,6 @@ template void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag) { - DerivCalls++; assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -329,11 +243,8 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, FermionField Atilde(B.Grid()); Atilde = A; - DerivCommTime-=usecond(); st.HaloExchange(B, compressor); - DerivCommTime+=usecond(); - DerivComputeTime-=usecond(); for (int mu = 0; mu < Nd; mu++) { //////////////////////////////////////////////////////////////////////// // Flip gamma (1+g)<->(1-g) if dag @@ -341,7 +252,6 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, int gamma = mu; if (!dag) gamma += Nd; - DerivDhopComputeTime -= usecond(); int Ls=1; Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma); @@ -349,9 +259,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, // spin trace outer product ////////////////////////////////////////////////// Impl::InsertForce4D(mat, Btilde, Atilde, mu); - DerivDhopComputeTime += usecond(); } - DerivComputeTime += usecond(); } template @@ -398,7 +306,6 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co template void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { - DhopCalls+=2; conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -410,7 +317,6 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da template void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { - DhopCalls++; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -423,7 +329,6 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int template void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { - DhopCalls++; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -488,14 +393,12 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, const FermionField &in, FermionField &out, int dag) { - DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); else #endif DhopInternalSerial(st,lo,U,in,out,dag); - DhopTotalTime+=usecond(); } template @@ -504,6 +407,7 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO const FermionField &in, FermionField &out, int dag) { + GRID_TRACE("DhopOverlapped"); assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -514,53 +418,55 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO ///////////////////////////// std::vector > requests; st.Prepare(); - DhopFaceTime-=usecond(); - st.HaloGather(in,compressor); - DhopFaceTime+=usecond(); + { + GRID_TRACE("Gather"); + st.HaloGather(in,compressor); + } - DhopCommTime -=usecond(); + tracePush("Communication"); st.CommunicateBegin(requests); ///////////////////////////// // Overlap with comms ///////////////////////////// - DhopFaceTime-=usecond(); - st.CommsMergeSHM(compressor); - DhopFaceTime+=usecond(); + { + GRID_TRACE("MergeSHM"); + st.CommsMergeSHM(compressor); + } ///////////////////////////// // do the compute interior ///////////////////////////// int Opt = WilsonKernelsStatic::Opt; - DhopComputeTime-=usecond(); if (dag == DaggerYes) { + GRID_TRACE("DhopDagInterior"); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); } else { + GRID_TRACE("DhopInterior"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); } - DhopComputeTime+=usecond(); ///////////////////////////// // Complete comms ///////////////////////////// st.CommunicateComplete(requests); - DhopCommTime +=usecond(); - - DhopFaceTime-=usecond(); - st.CommsMerge(compressor); - DhopFaceTime+=usecond(); + tracePop("Communication"); + { + GRID_TRACE("Merge"); + st.CommsMerge(compressor); + } ///////////////////////////// // do the compute exterior ///////////////////////////// - DhopComputeTime2-=usecond(); if (dag == DaggerYes) { + GRID_TRACE("DhopDagExterior"); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } else { + GRID_TRACE("DhopExterior"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } - DhopComputeTime2+=usecond(); }; @@ -570,20 +476,22 @@ void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, const FermionField &in, FermionField &out, int dag) { + GRID_TRACE("DhopSerial"); assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); - DhopCommTime-=usecond(); - st.HaloExchange(in, compressor); - DhopCommTime+=usecond(); + { + GRID_TRACE("HaloExchange"); + st.HaloExchange(in, compressor); + } - DhopComputeTime-=usecond(); int Opt = WilsonKernelsStatic::Opt; if (dag == DaggerYes) { + GRID_TRACE("DhopDag"); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } else { + GRID_TRACE("Dhop"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } - DhopComputeTime+=usecond(); }; /*Change ends */