mirror of https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Hadrons: module and VM instrumentation
parent f4cbfd63ff
commit 99a85116f8
@@ -173,3 +173,29 @@ void Hadrons::makeFileDir(const std::string filename, GridBase *g)
        }
    }
}

void Hadrons::printTimeProfile(const std::map<std::string, GridTime> &timing,
                               GridTime total)
{
    typedef decltype(total.count()) Count;

    std::map<Count, std::string, std::greater<Count>> rtiming;
    const double dtotal = static_cast<double>(total.count());
    auto         cf     = std::cout.flags();
    unsigned int width  = 0;

    for (auto &t: timing)
    {
        width = std::max(width, static_cast<unsigned int>(t.first.length()));
        rtiming[t.second.count()] = t.first;
    }
    for (auto &rt: rtiming)
    {
        LOG(Message) << std::right << std::setw(width) << rt.second << ": "
                     << rt.first << " us (" << std::fixed
                     << std::setprecision(1)
                     << static_cast<double>(rt.first)/dtotal*100 << "%)"
                     << std::endl;
    }
    std::cout.flags(cf);
}
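For illustration only (not part of this commit), the same sort-and-print logic as a self-contained program, with std::chrono::microseconds standing in for GridTime and std::cout standing in for LOG(Message); the sample section names and times are made up:

#include <algorithm>
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>

// Standalone sketch of the profile printer: sort sections by descending time
// and print each entry together with its share of the total.
int main(void)
{
    using Usecs = std::chrono::microseconds;

    std::map<std::string, Usecs> timing = {{"setup", Usecs(1200)},
                                           {"execution", Usecs(8800)}};
    Usecs total(10000);

    std::map<Usecs::rep, std::string, std::greater<Usecs::rep>> rtiming;
    const double dtotal = static_cast<double>(total.count());
    unsigned int width  = 0;

    for (auto &t: timing)
    {
        width = std::max(width, static_cast<unsigned int>(t.first.length()));
        rtiming[t.second.count()] = t.first;
    }
    for (auto &rt: rtiming)
    {
        std::cout << std::right << std::setw(width) << rt.second << ": "
                  << rt.first << " us (" << std::fixed << std::setprecision(1)
                  << static_cast<double>(rt.first)/dtotal*100 << "%)" << std::endl;
    }

    return 0;
}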
@@ -214,6 +214,9 @@ void makeFileDir(const std::string filename, GridBase *g);
#define _HADRONS_STR(x) #x
#define HADRONS_STR(x) _HADRONS_STR(x)

// pretty print time profile
void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);

END_HADRONS_NAMESPACE

#include <Grid/Hadrons/Exceptions.hpp>
@@ -56,6 +56,75 @@ std::string ModuleBase::getRegisteredName(void)
// execution ///////////////////////////////////////////////////////////////////
void ModuleBase::operator()(void)
{
    resetTimers();
    startTimer("_total");
    startTimer("_setup");
    setup();
    stopTimer("_setup");
    startTimer("_execute");
    execute();
    stopAllTimers();
}

// timers //////////////////////////////////////////////////////////////////////
void ModuleBase::startTimer(const std::string &name)
{
    if (!name.empty())
    {
        timer_[name].Start();
    }
}

void ModuleBase::startCurrentTimer(const std::string &name)
{
    if (!name.empty())
    {
        stopCurrentTimer();
        startTimer(name);
        currentTimer_ = name;
    }
}

void ModuleBase::stopTimer(const std::string &name)
{
    if (timer_.at(name).isRunning())
    {
        timer_.at(name).Stop();
    }
}

void ModuleBase::stopCurrentTimer(void)
{
    if (!currentTimer_.empty())
    {
        stopTimer(currentTimer_);
        currentTimer_ = "";
    }
}

void ModuleBase::stopAllTimers(void)
{
    for (auto &t: timer_)
    {
        stopTimer(t.first);
    }
    currentTimer_ = "";
}

void ModuleBase::resetTimers(void)
{
    timer_.clear();
    currentTimer_ = "";
}

std::map<std::string, GridTime> ModuleBase::getTimings(void)
{
    std::map<std::string, GridTime> timing;

    for (auto &t: timer_)
    {
        timing[t.first] = t.second.Elapsed();
    }

    return timing;
}
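For orientation only (not part of this commit), a compact standalone sketch of the timer-map lifecycle implemented above, with a toy stopwatch standing in for GridStopWatch; the structure mirrors operator()(), stopAllTimers() and getTimings(), but every name here is illustrative:

#include <chrono>
#include <iostream>
#include <map>
#include <string>

// Toy stand-in for GridStopWatch, just enough to show the timer-map lifecycle.
struct StopWatch
{
    using Clock = std::chrono::steady_clock;

    void Start(void) { start_ = Clock::now(); running_ = true; }
    void Stop(void)  { elapsed_ += Clock::now() - start_; running_ = false; }
    bool isRunning(void) { return running_; }
    std::chrono::microseconds Elapsed(void)
    {
        return std::chrono::duration_cast<std::chrono::microseconds>(elapsed_);
    }

    Clock::time_point start_;
    Clock::duration   elapsed_{0};
    bool              running_{false};
};

int main(void)
{
    std::map<std::string, StopWatch> timer;

    // Mirrors ModuleBase::operator()(): bracket the whole call, then the phases.
    timer["_total"].Start();
    timer["_setup"].Start();
    /* setup() would run here */
    timer["_setup"].Stop();
    timer["_execute"].Start();
    /* execute() would run here, possibly starting named section timers */

    // stopAllTimers() equivalent: stop whatever is still running.
    for (auto &t: timer)
    {
        if (t.second.isRunning()) t.second.Stop();
    }
    // getTimings() equivalent: collect elapsed times for the profile printer.
    for (auto &t: timer)
    {
        std::cout << t.first << ": " << t.second.Elapsed().count() << " us\n";
    }

    return 0;
}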
@@ -129,6 +129,8 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
}\
}

#define MODULE_TIMER(name) this->startCurrentTimer(name)

/******************************************************************************
 *                                Module class                                *
 ******************************************************************************/
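Since MODULE_TIMER forwards to startCurrentTimer, which first stops whatever named timer is currently running, consecutive calls inside a module body split it into non-overlapping, exclusively timed sections. A usage sketch inside a hypothetical module (illustrative only, not compilable on its own; TMyModule and the section names are made up):

// Usage sketch of the MODULE_TIMER macro defined above.
template <typename FImpl>
void TMyModule<FImpl>::execute(void)
{
    MODULE_TIMER("source");      // starts the "source" section timer
    /* ... build the source ... */
    MODULE_TIMER("contraction"); // stops "source", starts "contraction"
    /* ... do the contraction ... */
}   // ModuleBase::operator()() closes any section still running via stopAllTimers()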
@@ -161,13 +163,22 @@ public:
    virtual void execute(void) = 0;
    // execution
    void operator()(void);
    // timers
    void startTimer(const std::string &name);
    void startCurrentTimer(const std::string &name);
    void stopTimer(const std::string &name);
    void stopCurrentTimer(void);
    void stopAllTimers(void);
    void resetTimers(void);
    std::map<std::string, GridTime> getTimings(void);
protected:
    // environment shortcut
    DEFINE_ENV_ALIAS;
    // virtual machine shortcut
    DEFINE_VM_ALIAS;
private:
    std::string name_;
    std::string name_, currentTimer_;
    std::map<std::string, GridStopWatch> timer_;
};

// derived class, templating the parameter class
@@ -189,6 +189,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
    int stride=grid->_slice_stride[orthogdim];

    t0-=usecond();
    MODULE_TIMER("Colour trace * mom.");
    // Nested parallelism would be ok
    // Wasting cores here. Test case r
    parallel_for(int r=0;r<rd;r++)
@@ -233,6 +234,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
    t0+=usecond();

    // Sum across simd lanes in the plane, breaking out orthog dir.
    MODULE_TIMER("Local space sum");
    t1-=usecond();
    parallel_for(int rt=0;rt<rd;rt++)
    {
@@ -265,6 +267,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
    t2-=usecond();

    // ld loop and local only??
    MODULE_TIMER("Spin trace");
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
    parallel_for_nest2(int lt=0;lt<ld;lt++)
@@ -308,6 +311,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
    // Healthy size that should suffice
    ////////////////////////////////////////////////////////////////////
    t3-=usecond();
    MODULE_TIMER("Global sum");
    grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
    t3+=usecond();
}
@@ -430,6 +434,7 @@ void TA2AMesonField<FImpl>::execute(void)
        bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
               + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;

        MODULE_TIMER("Cache copy");
        for(int iii=0;iii< N_iii;iii++)
        for(int jjj=0;jjj< N_jjj;jjj++)
        for(int m =0;m< nmom;m++)
@@ -442,21 +447,10 @@ void TA2AMesonField<FImpl>::execute(void)
    }

    double nodes=grid->NodeCount();
    double t1 = usecond();
    LOG(Message) << "Contraction of MesonFields took "<<(t1-t0)/1.0e6<< " s" << std::endl;
    LOG(Message) << " Schur " << (t_schur)/1.0e6 << " s" << std::endl;
    LOG(Message) << " Contr " << (t_contr)/1.0e6 << " s" << std::endl;
    LOG(Message) << " Intern0 " << (t_int_0)/1.0e6 << " s" << std::endl;
    LOG(Message) << " Intern1 " << (t_int_1)/1.0e6 << " s" << std::endl;
    LOG(Message) << " Intern2 " << (t_int_2)/1.0e6 << " s" << std::endl;
    LOG(Message) << " Intern3 " << (t_int_3)/1.0e6 << " s" << std::endl;

    double t_kernel = t_int_0 + t_int_1;
    LOG(Message) << " Arith " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/ node " << std::endl;
    LOG(Message) << " Arith " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node " << std::endl;

    for(int t=0;t<nt;t++) corr[t] = corr[t]/ (double)nt;
    for(int t=0;t<nt;t++) LOG(Message) << " " << t << " " << corr[t]<<std::endl;
    LOG(Message) << "Perf " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/node " << std::endl;
    LOG(Message) << "Perf " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node " << std::endl;
}

END_MODULE_NAMESPACE
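A quick unit check on the "Perf" lines above (a note added for illustration, not part of the diff); the only assumption is what the code itself shows, namely that the t_* counters are accumulated in microseconds via usecond():

// flops / t_kernel          -> flop per microsecond = Mflop/s
// flops / t_kernel / 1.0e3  -> Gflop/s, then divided by the node count;
// the same scaling turns bytes / t_kernel into GB/s per node.
double gflop_per_s_per_node = flops/(t_kernel)/1.0e3/nodes;
double gbyte_per_s_per_node = bytes/(t_kernel)/1.0e3/nodes;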
@@ -178,6 +178,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
    {
        auto &epack = envGet(Pack, par().eigenPack);

        MODULE_TIMER("V low mode");
        LOG(Message) << "V vector i = " << il << " (low mode)" << std::endl;
        if (Ls == 1)
        {
@@ -188,6 +189,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
            envGetTmp(FermionField, f5);
            a2a.makeLowModeV5D(v[il], f5, epack.evec[il], epack.eval[il]);
        }
        MODULE_TIMER("W low mode");
        LOG(Message) << "W vector i = " << il << " (low mode)" << std::endl;
        if (Ls == 1)
        {
@@ -203,6 +205,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
    // High modes
    for (unsigned int ih = 0; ih < noise.size(); ih++)
    {
        MODULE_TIMER("V high mode");
        LOG(Message) << "V vector i = " << Nl_ + ih
                     << " (" << ((Nl_ > 0) ? "high " : "")
                     << "stochastic mode)" << std::endl;
@@ -215,6 +218,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
            envGetTmp(FermionField, f5);
            a2a.makeHighModeV5D(v[Nl_ + ih], f5, noise[ih]);
        }
        MODULE_TIMER("W high mode");
        LOG(Message) << "W vector i = " << Nl_ + ih
                     << " (" << ((Nl_ > 0) ? "high " : "")
                     << "stochastic mode)" << std::endl;
@@ -622,9 +622,14 @@ VirtualMachine::Program VirtualMachine::schedule(const GeneticPar &par)
}

// general execution ///////////////////////////////////////////////////////////
#define BIG_SEP "==============="
#define SEP "---------------"
#define BIG_SEP "==============="
#define SEP "---------------"
#define SMALL_SEP "..............."
#define MEM_MSG(size) sizeString(size)
#define PRINT_TIME(name, t, total) \
std::right << std::setw(30) << name << ": " << t.count() << " us (" \
<< std::fixed << std::setprecision(1) \
<< static_cast<double>((t).count())/total*100 << "%)"

void VirtualMachine::executeProgram(const Program &p)
{
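PRINT_TIME expands to a stream-insertion fragment rather than a complete statement, so it can be spliced into a LOG line. A hypothetical use, not taken from this diff ("timings" as a GridTime map and "total" as a double total in microseconds are assumptions for this example):

// Hypothetical composition of the PRINT_TIME fragment into a log message.
LOG(Message) << PRINT_TIME("execution", timings.at("_execute"), total) << std::endl;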
@@ -654,11 +659,31 @@ void VirtualMachine::executeProgram(const Program &p)
        LOG(Message) << SEP << " Measurement step " << i + 1 << "/"
                     << p.size() << " (module '" << module_[p[i]].name
                     << "') " << SEP << std::endl;
        LOG(Message) << SMALL_SEP << " Module execution" << std::endl;
        currentModule_ = p[i];
        (*module_[p[i]].data)();
        currentModule_ = -1;
        sizeBefore = env().getTotalSize();
        // print time profile after execution
        LOG(Message) << SMALL_SEP << " Timings" << std::endl;

        std::map<std::string, GridTime> ctiming, gtiming;
        GridTime total;

        ctiming = module_[p[i]].data->getTimings();
        total = ctiming.at("_total");
        gtiming["total"] = ctiming["_total"]; ctiming.erase("_total");
        gtiming["setup"] = ctiming["_setup"]; ctiming.erase("_setup");
        gtiming["execution"] = ctiming["_execute"]; ctiming.erase("_execute");
        LOG(Message) << "* GLOBAL TIMERS" << std::endl;
        printTimeProfile(gtiming, total);
        if (!ctiming.empty())
        {
            LOG(Message) << "* CUSTOM TIMERS" << std::endl;
            printTimeProfile(ctiming, total);
        }
        // print used memory after execution
        LOG(Message) << SMALL_SEP << " Memory management" << std::endl;
        LOG(Message) << "Allocated objects: " << MEM_MSG(sizeBefore)
                     << std::endl;
        if (sizeBefore > memPeak)
@@ -102,6 +102,9 @@ public:
        assert(running == false);
        return (uint64_t) accumulator.count();
    }
    bool isRunning(void){
        return running;
    }
};

}