Mirror of https://github.com/paboyle/Grid.git, synced 2024-09-20 01:05:38 +01:00

Hadrons: module and VM instrumentation

Antonin Portelli 2018-08-10 16:07:30 +01:00
parent f4cbfd63ff
commit 99a85116f8
8 changed files with 151 additions and 16 deletions

View File

@@ -173,3 +173,29 @@ void Hadrons::makeFileDir(const std::string filename, GridBase *g)
}
}
}
void Hadrons::printTimeProfile(const std::map<std::string, GridTime> &timing,
                               GridTime total)
{
    typedef decltype(total.count()) Count;

    std::map<Count, std::string, std::greater<Count>> rtiming;
    const double dtotal = static_cast<double>(total.count());
    auto         cf     = std::cout.flags();
    unsigned int width  = 0;

    for (auto &t: timing)
    {
        width = std::max(width, static_cast<unsigned int>(t.first.length()));
        rtiming[t.second.count()] = t.first;
    }
    for (auto &rt: rtiming)
    {
        LOG(Message) << std::right << std::setw(width) << rt.second << ": "
                     << rt.first << " us (" << std::fixed
                     << std::setprecision(1)
                     << static_cast<double>(rt.first)/dtotal*100 << "%)"
                     << std::endl;
    }
    std::cout.flags(cf);
}
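
A minimal standalone sketch of the same reverse-sorted printout, with std::chrono::microseconds standing in for GridTime and std::cout for the LOG macro (both substitutions are assumptions, not the Hadrons API). One caveat: keying the reverse map on the raw count silently drops timers with identical durations, so the sketch uses a std::multimap instead.

// sketch: reverse-sorted time profile, assuming std::chrono in place of GridTime
#include <algorithm>
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>

using Time = std::chrono::microseconds;

void printProfile(const std::map<std::string, Time> &timing, Time total)
{
    using Count = Time::rep;

    // multimap: two timers with the same duration both survive
    std::multimap<Count, std::string, std::greater<Count>> rtiming;
    const double dtotal = static_cast<double>(total.count());
    unsigned int width  = 0;

    for (auto &t: timing)
    {
        width = std::max(width, static_cast<unsigned int>(t.first.length()));
        rtiming.emplace(t.second.count(), t.first);
    }
    for (auto &rt: rtiming)
    {
        std::cout << std::right << std::setw(width) << rt.second << ": "
                  << rt.first << " us (" << std::fixed << std::setprecision(1)
                  << static_cast<double>(rt.first)/dtotal*100 << "%)"
                  << std::endl;
    }
}

int main(void)
{
    std::map<std::string, Time> timing = {{"setup", Time(120)},
                                          {"execution", Time(880)}};

    printProfile(timing, Time(1000)); // longest timer is printed first
}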

View File

@@ -214,6 +214,9 @@ void makeFileDir(const std::string filename, GridBase *g);
#define _HADRONS_STR(x) #x
#define HADRONS_STR(x) _HADRONS_STR(x)
// pretty print time profile
void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
END_HADRONS_NAMESPACE
#include <Grid/Hadrons/Exceptions.hpp>

View File

@@ -56,6 +56,75 @@ std::string ModuleBase::getRegisteredName(void)
// execution ///////////////////////////////////////////////////////////////////
void ModuleBase::operator()(void)
{
    resetTimers();
    startTimer("_total");
    startTimer("_setup");
    setup();
    stopTimer("_setup");
    startTimer("_execute");
    execute();
    stopAllTimers();
}

// timers //////////////////////////////////////////////////////////////////////
void ModuleBase::startTimer(const std::string &name)
{
    if (!name.empty())
    {
        timer_[name].Start();
    }
}

void ModuleBase::startCurrentTimer(const std::string &name)
{
    if (!name.empty())
    {
        stopCurrentTimer();
        startTimer(name);
        currentTimer_ = name;
    }
}

void ModuleBase::stopTimer(const std::string &name)
{
    if (timer_.at(name).isRunning())
    {
        timer_.at(name).Stop();
    }
}

void ModuleBase::stopCurrentTimer(void)
{
    if (!currentTimer_.empty())
    {
        stopTimer(currentTimer_);
        currentTimer_ = "";
    }
}

void ModuleBase::stopAllTimers(void)
{
    for (auto &t: timer_)
    {
        stopTimer(t.first);
    }
    currentTimer_ = "";
}

void ModuleBase::resetTimers(void)
{
    timer_.clear();
    currentTimer_ = "";
}

std::map<std::string, GridTime> ModuleBase::getTimings(void)
{
    std::map<std::string, GridTime> timing;

    for (auto &t: timer_)
    {
        timing[t.first] = t.second.Elapsed();
    }

    return timing;
}
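
The key behaviour here is that at most one "current" timer runs at a time: each startCurrentTimer call closes the previous section before opening the next, so successive calls partition a module's run into disjoint, accumulating sections. A self-contained sketch of that switching logic (StopWatch is a stand-in for GridStopWatch, not the Grid implementation):

// sketch of the exclusive "current timer" semantics
#include <chrono>
#include <iostream>
#include <map>
#include <string>
#include <thread>

struct StopWatch
{
    using Clock = std::chrono::steady_clock;

    Clock::time_point         start;
    std::chrono::microseconds accumulator{0};
    bool                      running = false;

    void Start(void) { start = Clock::now(); running = true; }
    void Stop(void)
    {
        accumulator += std::chrono::duration_cast<std::chrono::microseconds>(
            Clock::now() - start);
        running = false;
    }
    bool isRunning(void) { return running; }
};

std::map<std::string, StopWatch> timer_;
std::string                      currentTimer_;

void startCurrentTimer(const std::string &name)
{
    if (!currentTimer_.empty() && timer_.at(currentTimer_).isRunning())
    {
        timer_.at(currentTimer_).Stop(); // close the previous section
    }
    timer_[name].Start();                // open (or resume) the new one
    currentTimer_ = name;
}

int main(void)
{
    startCurrentTimer("phase 1");
    std::this_thread::sleep_for(std::chrono::milliseconds(20));
    startCurrentTimer("phase 2");        // implicitly stops "phase 1"
    std::this_thread::sleep_for(std::chrono::milliseconds(30));
    timer_.at(currentTimer_).Stop();
    for (auto &t: timer_)
    {
        std::cout << t.first << ": " << t.second.accumulator.count()
                  << " us" << std::endl;
    }
}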

View File

@@ -129,6 +129,8 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
}\
}
#define MODULE_TIMER(name) this->startCurrentTimer(name)
/******************************************************************************
* Module class *
******************************************************************************/
@@ -161,13 +163,22 @@ public:
    virtual void execute(void) = 0;
    // execution
    void operator()(void);
    // timers
    void startTimer(const std::string &name);
    void startCurrentTimer(const std::string &name);
    void stopTimer(const std::string &name);
    void stopCurrentTimer(void);
    void stopAllTimers(void);
    void resetTimers(void);
    std::map<std::string, GridTime> getTimings(void);
protected:
    // environment shortcut
    DEFINE_ENV_ALIAS;
    // virtual machine shortcut
    DEFINE_VM_ALIAS;
private:
    std::string                          name_, currentTimer_;
    std::map<std::string, GridStopWatch> timer_;
};
// derived class, templating the parameter class
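
From a module author's point of view the whole machinery reduces to the MODULE_TIMER macro. A hypothetical execute() (TMyModule, makeSource and contract are illustrative names, not part of this commit) would be instrumented like this:

// hypothetical module body; assumes the Hadrons environment from this commit
template <typename FImpl>
void TMyModule<FImpl>::execute(void)
{
    MODULE_TIMER("source construction"); // opens the first custom section
    makeSource();                        // illustrative helper

    MODULE_TIMER("contraction");         // closes "source construction",
    contract();                          // opens "contraction"

    // no explicit stop needed: operator() ends with stopAllTimers(), and
    // both sections appear under "CUSTOM TIMERS" in the scheduler log
}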

View File

@@ -189,6 +189,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
int stride=grid->_slice_stride[orthogdim];
t0-=usecond();
MODULE_TIMER("Colour trace * mom.");
// Nested parallelism would be ok
// Wasting cores here. Test case r
parallel_for(int r=0;r<rd;r++)
@@ -233,6 +234,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
t0+=usecond();
// Sum across simd lanes in the plane, breaking out orthog dir.
MODULE_TIMER("Local space sum");
t1-=usecond();
parallel_for(int rt=0;rt<rd;rt++)
{
@@ -265,6 +267,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
t2-=usecond();
// ld loop and local only??
MODULE_TIMER("Spin trace");
int pd = grid->_processors[orthogdim];
int pc = grid->_processor_coor[orthogdim];
parallel_for_nest2(int lt=0;lt<ld;lt++)
@@ -308,6 +311,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
// Healthy size that should suffice
////////////////////////////////////////////////////////////////////
t3-=usecond();
MODULE_TIMER("Global sum");
grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
t3+=usecond();
}
@@ -430,6 +434,7 @@ void TA2AMesonField<FImpl>::execute(void)
bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
+ vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
MODULE_TIMER("Cache copy");
for(int iii=0;iii< N_iii;iii++)
for(int jjj=0;jjj< N_jjj;jjj++)
for(int m =0;m< nmom;m++)
@@ -442,21 +447,10 @@ void TA2AMesonField<FImpl>::execute(void)
}
double nodes=grid->NodeCount();
double t1 = usecond();
LOG(Message) << "Contraction of MesonFields took "<<(t1-t0)/1.0e6<< " s" << std::endl;
LOG(Message) << " Schur " << (t_schur)/1.0e6 << " s" << std::endl;
LOG(Message) << " Contr " << (t_contr)/1.0e6 << " s" << std::endl;
LOG(Message) << " Intern0 " << (t_int_0)/1.0e6 << " s" << std::endl;
LOG(Message) << " Intern1 " << (t_int_1)/1.0e6 << " s" << std::endl;
LOG(Message) << " Intern2 " << (t_int_2)/1.0e6 << " s" << std::endl;
LOG(Message) << " Intern3 " << (t_int_3)/1.0e6 << " s" << std::endl;
double t_kernel = t_int_0 + t_int_1;
LOG(Message) << " Arith " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/ node " << std::endl;
LOG(Message) << " Arith " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node " << std::endl;
for(int t=0;t<nt;t++) corr[t] = corr[t]/ (double)nt;
for(int t=0;t<nt;t++) LOG(Message) << " " << t << " " << corr[t]<<std::endl;
LOG(Message) << "Perf " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/node " << std::endl;
LOG(Message) << "Perf " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node " << std::endl;
}
END_MODULE_NAMESPACE
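
As a units check on the t_kernel report (kept by this commit, only relabelled from "Arith" to "Perf" while the per-phase LOG lines give way to MODULE_TIMER sections): flops and bytes are totals for the contraction and t_kernel is accumulated with usecond(), i.e. in microseconds, so flops/t_kernel is flop/us = Mflop/s, the /1.0e3 turns that into Gflop/s, and /nodes gives the per-node figure. A tiny check with made-up numbers:

// units check for the performance report (all values are made up)
#include <iostream>

int main(void)
{
    double flops    = 2.0e12; // total floating-point operations
    double t_kernel = 5.0e6;  // kernel time in us, i.e. 5 s
    double nodes    = 4.0;    // machine size

    // flop/us = Mflop/s; /1.0e3 -> Gflop/s; /nodes -> per node
    std::cout << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/node" << std::endl;
    // prints 100 Gflop/s/node
}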

View File

@@ -178,6 +178,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
{
auto &epack = envGet(Pack, par().eigenPack);
MODULE_TIMER("V low mode");
LOG(Message) << "V vector i = " << il << " (low mode)" << std::endl;
if (Ls == 1)
{
@@ -188,6 +189,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
envGetTmp(FermionField, f5);
a2a.makeLowModeV5D(v[il], f5, epack.evec[il], epack.eval[il]);
}
MODULE_TIMER("W low mode");
LOG(Message) << "W vector i = " << il << " (low mode)" << std::endl;
if (Ls == 1)
{
@@ -203,6 +205,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
// High modes
for (unsigned int ih = 0; ih < noise.size(); ih++)
{
MODULE_TIMER("V high mode");
LOG(Message) << "V vector i = " << Nl_ + ih
<< " (" << ((Nl_ > 0) ? "high " : "")
<< "stochastic mode)" << std::endl;
@@ -215,6 +218,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
envGetTmp(FermionField, f5);
a2a.makeHighModeV5D(v[Nl_ + ih], f5, noise[ih]);
}
MODULE_TIMER("W high mode");
LOG(Message) << "W vector i = " << Nl_ + ih
<< " (" << ((Nl_ > 0) ? "high " : "")
<< "stochastic mode)" << std::endl;

View File

@@ -622,9 +622,14 @@ VirtualMachine::Program VirtualMachine::schedule(const GeneticPar &par)
}
// general execution ///////////////////////////////////////////////////////////
#define BIG_SEP   "==============="
#define SEP       "---------------"
#define SMALL_SEP "..............."
#define MEM_MSG(size) sizeString(size)
#define PRINT_TIME(name, t, total) \
std::right << std::setw(30) << name << ": " << (t).count() << " us (" \
<< std::fixed << std::setprecision(1) \
<< static_cast<double>((t).count())/(total)*100 << "%)"
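
PRINT_TIME is not used in the hunks shown here; presumably it composes into a log statement along these (assumed) lines, where t is a GridTime section timer and the last argument is the total time as a plain count:

// assumed usage, not shown in this commit
LOG(Message) << PRINT_TIME("setup", t, total.count()) << std::endl;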
void VirtualMachine::executeProgram(const Program &p)
{
@@ -654,11 +659,31 @@ void VirtualMachine::executeProgram(const Program &p)
        LOG(Message) << SEP << " Measurement step " << i + 1 << "/"
                     << p.size() << " (module '" << module_[p[i]].name
                     << "') " << SEP << std::endl;
        LOG(Message) << SMALL_SEP << " Module execution" << std::endl;
        currentModule_ = p[i];
        (*module_[p[i]].data)();
        currentModule_ = -1;
        sizeBefore = env().getTotalSize();
        // print time profile after execution
        LOG(Message) << SMALL_SEP << " Timings" << std::endl;

        std::map<std::string, GridTime> ctiming, gtiming;
        GridTime                        total;

        ctiming = module_[p[i]].data->getTimings();
        total   = ctiming.at("_total");
        gtiming["total"]     = ctiming["_total"];   ctiming.erase("_total");
        gtiming["setup"]     = ctiming["_setup"];   ctiming.erase("_setup");
        gtiming["execution"] = ctiming["_execute"]; ctiming.erase("_execute");
        LOG(Message) << "* GLOBAL TIMERS" << std::endl;
        printTimeProfile(gtiming, total);
        if (!ctiming.empty())
        {
            LOG(Message) << "* CUSTOM TIMERS" << std::endl;
            printTimeProfile(ctiming, total);
        }
        // print used memory after execution
        LOG(Message) << SMALL_SEP << " Memory management" << std::endl;
        LOG(Message) << "Allocated objects: " << MEM_MSG(sizeBefore)
                     << std::endl;
        if (sizeBefore > memPeak)
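
The net effect of this block: the three underscore-prefixed timers that operator() always records are pulled out into a "global" group, and whatever MODULE_TIMER left behind is reported separately, both as percentages of _total. A small sketch of the split, assuming std::chrono durations in place of GridTime and made-up values:

// sketch of the global/custom timer split
#include <chrono>
#include <iostream>
#include <map>
#include <string>

using Time = std::chrono::microseconds;

int main(void)
{
    // what getTimings() might return after a module has run
    std::map<std::string, Time> ctiming = {{"_total",      Time(1000)},
                                           {"_setup",      Time(100)},
                                           {"_execute",    Time(900)},
                                           {"contraction", Time(850)}};
    std::map<std::string, Time> gtiming;
    Time total = ctiming.at("_total");

    gtiming["total"]     = ctiming["_total"];   ctiming.erase("_total");
    gtiming["setup"]     = ctiming["_setup"];   ctiming.erase("_setup");
    gtiming["execution"] = ctiming["_execute"]; ctiming.erase("_execute");

    // ctiming now only holds the module's own MODULE_TIMER sections
    for (auto &t: ctiming)
    {
        std::cout << t.first << ": " << t.second.count() << " us ("
                  << 100.*t.second.count()/total.count() << "%)" << std::endl;
    }
    // prints: contraction: 850 us (85%)
}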

View File

@@ -102,6 +102,9 @@ public:
        assert(running == false);
        return (uint64_t) accumulator.count();
    }
    bool isRunning(void) {
        return running;
    }
};
}