From 2cbb72a81c68832b6df2d448f4001aa2ff467104 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Thu, 26 Apr 2018 10:10:07 +0100 Subject: [PATCH] Provide info if EE term is trivial (m^2 factor) Better timing in staggered 4d case --- .../fermion/ImprovedStaggeredFermion.cc | 75 +++++++++++++++++++ .../action/fermion/ImprovedStaggeredFermion.h | 15 +++- .../fermion/ImprovedStaggeredFermion5D.cc | 9 ++- .../fermion/ImprovedStaggeredFermion5D.h | 3 + lib/qcd/action/fermion/WilsonFermion.h | 2 + 5 files changed, 101 insertions(+), 3 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 811e482d..545af639 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -330,6 +330,7 @@ void ImprovedStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionF template void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=2; conformable(in._grid, _grid); // verifies full grid conformable(in._grid, out._grid); @@ -340,6 +341,7 @@ void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField & template void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=1; conformable(in._grid, _cbgrid); // verifies half grid conformable(in._grid, out._grid); // drops the cb check @@ -351,6 +353,7 @@ void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField template void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=1; conformable(in._grid, _cbgrid); // verifies half grid conformable(in._grid, out._grid); // drops the cb check @@ -403,13 +406,18 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st int len = U._grid->oSites(); const int LLs = 1; + DhopTotalTime -= usecond(); + + DhopFaceTime -= usecond(); st.Prepare(); 
st.HaloGather(in,compressor); st.CommsMergeSHM(compressor); + DhopFaceTime += usecond(); ////////////////////////////////////////////////////////////////////////////////////////////////////// // Ugly explicit thread mapping introduced for OPA reasons. ////////////////////////////////////////////////////////////////////////////////////////////////////// + DhopComputeTime -= usecond(); #pragma omp parallel { int tid = omp_get_thread_num(); @@ -451,10 +459,14 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st st.CommunicateThreaded(); } } + DhopComputeTime += usecond(); // First to enter, last to leave timing + DhopFaceTime -= usecond(); st.CommsMerge(compressor); + DhopFaceTime += usecond(); // close the CommsMerge timing window (was a second -=) + DhopComputeTime2 -= usecond(); if (dag == DaggerYes) { int sz=st.surface_list.size(); parallel_for (int ss = 0; ss < sz; ss++) { @@ -468,6 +480,7 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); } } + DhopComputeTime2 += usecond(); DhopTotalTime += usecond(); #else assert(0); #endif @@ -483,9 +496,14 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le { assert((dag == DaggerNo) || (dag == DaggerYes)); + DhopTotalTime -= usecond(); + + DhopCommTime -= usecond(); Compressor compressor; st.HaloExchange(in, compressor); + DhopCommTime += usecond(); + DhopComputeTime -= usecond(); if (dag == DaggerYes) { parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); @@ -495,8 +513,65 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); }; + //////////////////////////////////////////////////////////////// + // Reporting + //////////////////////////////////////////////////////////////// +template +void ImprovedStaggeredFermion::Report(void) +{ + 
std::vector latt = GridDefaultLatt(); + RealD volume = 1; for(int mu=0;mu_Nprocessors; + RealD NN = _grid->NodeCount(); + + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : " + << DhopCalls << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : " + << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : " + << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : " + << DhopComputeTime / DhopCalls << " us" << std::endl; + + // Average the compute time + _grid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" < +void ImprovedStaggeredFermion::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; DhopComputeTime2 = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} + + FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion); 
//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h index 69d0aef4..750d29c6 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS FermionField _tmp; FermionField &tmp(void) { return _tmp; } + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// @@ -142,7 +154,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS // protected: public: // any other parameters of action ??? 
- + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } RealD mass; RealD u0; RealD c1; diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index e5146d7a..ab9c9c48 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -291,14 +291,12 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { - DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else #endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); - DhopTotalTime+=usecond(); } template @@ -412,6 +410,7 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, + //double t1=usecond(); DhopTotalTime -= usecond(); DhopCommTime -= usecond(); st.HaloExchange(in,compressor); @@ -432,6 +431,12 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, } DhopComputeTime += usecond(); DhopTotalTime += usecond(); + //double t2=usecond(); + //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl; + } /*CHANGE END*/ diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index f2fce1c1..4024b472 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -178,6 +178,9 @@ namespace QCD { // Data members require to support the 
functionality /////////////////////////////////////////////////////////////// public: + + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } GridBase *_FourDimGrid; GridBase *_FourDimRedBlackGrid; diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 55433854..d0181d68 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -135,6 +135,8 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { // protected: public: + virtual RealD Mass(void) { return mass; } + virtual int isTrivialEE(void) { return 1; }; RealD mass; GridBase *_grid;