mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Hadrons: module and VM instrumentation
This commit is contained in:
parent
f4cbfd63ff
commit
99a85116f8
@ -173,3 +173,29 @@ void Hadrons::makeFileDir(const std::string filename, GridBase *g)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Hadrons::printTimeProfile(const std::map<std::string, GridTime> &timing,
|
||||||
|
GridTime total)
|
||||||
|
{
|
||||||
|
typedef decltype(total.count()) Count;
|
||||||
|
|
||||||
|
std::map<Count, std::string, std::greater<Count>> rtiming;
|
||||||
|
const double dtotal = static_cast<double>(total.count());
|
||||||
|
auto cf = std::cout.flags();
|
||||||
|
unsigned int width = 0;
|
||||||
|
|
||||||
|
for (auto &t: timing)
|
||||||
|
{
|
||||||
|
width = std::max(width, static_cast<unsigned int>(t.first.length()));
|
||||||
|
rtiming[t.second.count()] = t.first;
|
||||||
|
}
|
||||||
|
for (auto &rt: rtiming)
|
||||||
|
{
|
||||||
|
LOG(Message) << std::right << std::setw(width) << rt.second << ": "
|
||||||
|
<< rt.first << " us (" << std::fixed
|
||||||
|
<< std::setprecision(1)
|
||||||
|
<< static_cast<double>(rt.first)/dtotal*100 << "%)"
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
std::cout.flags(cf);
|
||||||
|
}
|
||||||
|
@ -214,6 +214,9 @@ void makeFileDir(const std::string filename, GridBase *g);
|
|||||||
#define _HADRONS_STR(x) #x
|
#define _HADRONS_STR(x) #x
|
||||||
#define HADRONS_STR(x) _HADRONS_STR(x)
|
#define HADRONS_STR(x) _HADRONS_STR(x)
|
||||||
|
|
||||||
|
// pretty print time profile
|
||||||
|
void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
|
||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#include <Grid/Hadrons/Exceptions.hpp>
|
#include <Grid/Hadrons/Exceptions.hpp>
|
||||||
|
@ -56,6 +56,75 @@ std::string ModuleBase::getRegisteredName(void)
|
|||||||
// execution ///////////////////////////////////////////////////////////////////
|
// execution ///////////////////////////////////////////////////////////////////
|
||||||
void ModuleBase::operator()(void)
|
void ModuleBase::operator()(void)
|
||||||
{
|
{
|
||||||
|
resetTimers();
|
||||||
|
startTimer("_total");
|
||||||
|
startTimer("_setup");
|
||||||
setup();
|
setup();
|
||||||
|
stopTimer("_setup");
|
||||||
|
startTimer("_execute");
|
||||||
execute();
|
execute();
|
||||||
|
stopAllTimers();
|
||||||
|
}
|
||||||
|
|
||||||
|
// timers //////////////////////////////////////////////////////////////////////
|
||||||
|
void ModuleBase::startTimer(const std::string &name)
|
||||||
|
{
|
||||||
|
if (!name.empty())
|
||||||
|
{
|
||||||
|
timer_[name].Start();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ModuleBase::startCurrentTimer(const std::string &name)
|
||||||
|
{
|
||||||
|
if (!name.empty())
|
||||||
|
{
|
||||||
|
stopCurrentTimer();
|
||||||
|
startTimer(name);
|
||||||
|
currentTimer_ = name;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ModuleBase::stopTimer(const std::string &name)
|
||||||
|
{
|
||||||
|
if (timer_.at(name).isRunning())
|
||||||
|
{
|
||||||
|
timer_.at(name).Stop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ModuleBase::stopCurrentTimer(void)
|
||||||
|
{
|
||||||
|
if (!currentTimer_.empty())
|
||||||
|
{
|
||||||
|
stopTimer(currentTimer_);
|
||||||
|
currentTimer_ = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ModuleBase::stopAllTimers(void)
|
||||||
|
{
|
||||||
|
for (auto &t: timer_)
|
||||||
|
{
|
||||||
|
stopTimer(t.first);
|
||||||
|
}
|
||||||
|
currentTimer_ = "";
|
||||||
|
}
|
||||||
|
|
||||||
|
void ModuleBase::resetTimers(void)
|
||||||
|
{
|
||||||
|
timer_.clear();
|
||||||
|
currentTimer_ = "";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::map<std::string, GridTime> ModuleBase::getTimings(void)
|
||||||
|
{
|
||||||
|
std::map<std::string, GridTime> timing;
|
||||||
|
|
||||||
|
for (auto &t: timer_)
|
||||||
|
{
|
||||||
|
timing[t.first] = t.second.Elapsed();
|
||||||
|
}
|
||||||
|
|
||||||
|
return timing;
|
||||||
}
|
}
|
||||||
|
@ -129,6 +129,8 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
|
|||||||
}\
|
}\
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Shortcut for use inside module member functions: switches the module's
// current timer to `name` through this->startCurrentTimer.
#define MODULE_TIMER(name) this->startCurrentTimer(name)
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
* Module class *
|
* Module class *
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
@ -161,13 +163,22 @@ public:
|
|||||||
virtual void execute(void) = 0;
|
virtual void execute(void) = 0;
|
||||||
// execution
|
// execution
|
||||||
void operator()(void);
|
void operator()(void);
|
||||||
|
// timers
|
||||||
|
void startTimer(const std::string &name);
|
||||||
|
void startCurrentTimer(const std::string &name);
|
||||||
|
void stopTimer(const std::string &name);
|
||||||
|
void stopCurrentTimer(void);
|
||||||
|
void stopAllTimers(void);
|
||||||
|
void resetTimers(void);
|
||||||
|
std::map<std::string, GridTime> getTimings(void);
|
||||||
protected:
|
protected:
|
||||||
// environment shortcut
|
// environment shortcut
|
||||||
DEFINE_ENV_ALIAS;
|
DEFINE_ENV_ALIAS;
|
||||||
// virtual machine shortcut
|
// virtual machine shortcut
|
||||||
DEFINE_VM_ALIAS;
|
DEFINE_VM_ALIAS;
|
||||||
private:
|
private:
|
||||||
std::string name_;
|
std::string name_, currentTimer_;
|
||||||
|
std::map<std::string, GridStopWatch> timer_;
|
||||||
};
|
};
|
||||||
|
|
||||||
// derived class, templating the parameter class
|
// derived class, templating the parameter class
|
||||||
|
@ -189,6 +189,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
|
|||||||
int stride=grid->_slice_stride[orthogdim];
|
int stride=grid->_slice_stride[orthogdim];
|
||||||
|
|
||||||
t0-=usecond();
|
t0-=usecond();
|
||||||
|
MODULE_TIMER("Colour trace * mom.");
|
||||||
// Nested parallelism would be ok
|
// Nested parallelism would be ok
|
||||||
// Wasting cores here. Test case r
|
// Wasting cores here. Test case r
|
||||||
parallel_for(int r=0;r<rd;r++)
|
parallel_for(int r=0;r<rd;r++)
|
||||||
@ -233,6 +234,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
|
|||||||
t0+=usecond();
|
t0+=usecond();
|
||||||
|
|
||||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||||
|
MODULE_TIMER("Local space sum");
|
||||||
t1-=usecond();
|
t1-=usecond();
|
||||||
parallel_for(int rt=0;rt<rd;rt++)
|
parallel_for(int rt=0;rt<rd;rt++)
|
||||||
{
|
{
|
||||||
@ -265,6 +267,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
|
|||||||
t2-=usecond();
|
t2-=usecond();
|
||||||
|
|
||||||
// ld loop and local only??
|
// ld loop and local only??
|
||||||
|
MODULE_TIMER("Spin trace");
|
||||||
int pd = grid->_processors[orthogdim];
|
int pd = grid->_processors[orthogdim];
|
||||||
int pc = grid->_processor_coor[orthogdim];
|
int pc = grid->_processor_coor[orthogdim];
|
||||||
parallel_for_nest2(int lt=0;lt<ld;lt++)
|
parallel_for_nest2(int lt=0;lt<ld;lt++)
|
||||||
@ -308,6 +311,7 @@ void TA2AMesonField<FImpl>::MesonField(Eigen::Tensor<ComplexD,5> &mat,
|
|||||||
// Healthy size that should suffice
|
// Healthy size that should suffice
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
t3-=usecond();
|
t3-=usecond();
|
||||||
|
MODULE_TIMER("Global sum");
|
||||||
grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
|
grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
|
||||||
t3+=usecond();
|
t3+=usecond();
|
||||||
}
|
}
|
||||||
@ -430,6 +434,7 @@ void TA2AMesonField<FImpl>::execute(void)
|
|||||||
bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
|
bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
|
||||||
+ vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
|
+ vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
|
||||||
|
|
||||||
|
MODULE_TIMER("Cache copy");
|
||||||
for(int iii=0;iii< N_iii;iii++)
|
for(int iii=0;iii< N_iii;iii++)
|
||||||
for(int jjj=0;jjj< N_jjj;jjj++)
|
for(int jjj=0;jjj< N_jjj;jjj++)
|
||||||
for(int m =0;m< nmom;m++)
|
for(int m =0;m< nmom;m++)
|
||||||
@ -442,21 +447,10 @@ void TA2AMesonField<FImpl>::execute(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
double nodes=grid->NodeCount();
|
double nodes=grid->NodeCount();
|
||||||
double t1 = usecond();
|
|
||||||
LOG(Message) << "Contraction of MesonFields took "<<(t1-t0)/1.0e6<< " s" << std::endl;
|
|
||||||
LOG(Message) << " Schur " << (t_schur)/1.0e6 << " s" << std::endl;
|
|
||||||
LOG(Message) << " Contr " << (t_contr)/1.0e6 << " s" << std::endl;
|
|
||||||
LOG(Message) << " Intern0 " << (t_int_0)/1.0e6 << " s" << std::endl;
|
|
||||||
LOG(Message) << " Intern1 " << (t_int_1)/1.0e6 << " s" << std::endl;
|
|
||||||
LOG(Message) << " Intern2 " << (t_int_2)/1.0e6 << " s" << std::endl;
|
|
||||||
LOG(Message) << " Intern3 " << (t_int_3)/1.0e6 << " s" << std::endl;
|
|
||||||
|
|
||||||
double t_kernel = t_int_0 + t_int_1;
|
double t_kernel = t_int_0 + t_int_1;
|
||||||
LOG(Message) << " Arith " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/ node " << std::endl;
|
|
||||||
LOG(Message) << " Arith " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node " << std::endl;
|
|
||||||
|
|
||||||
for(int t=0;t<nt;t++) corr[t] = corr[t]/ (double)nt;
|
LOG(Message) << "Perf " << flops/(t_kernel)/1.0e3/nodes << " Gflop/s/node " << std::endl;
|
||||||
for(int t=0;t<nt;t++) LOG(Message) << " " << t << " " << corr[t]<<std::endl;
|
LOG(Message) << "Perf " << bytes/(t_kernel)/1.0e3/nodes << " GB/s/node " << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
END_MODULE_NAMESPACE
|
END_MODULE_NAMESPACE
|
||||||
|
@ -178,6 +178,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
|
|||||||
{
|
{
|
||||||
auto &epack = envGet(Pack, par().eigenPack);
|
auto &epack = envGet(Pack, par().eigenPack);
|
||||||
|
|
||||||
|
MODULE_TIMER("V low mode");
|
||||||
LOG(Message) << "V vector i = " << il << " (low mode)" << std::endl;
|
LOG(Message) << "V vector i = " << il << " (low mode)" << std::endl;
|
||||||
if (Ls == 1)
|
if (Ls == 1)
|
||||||
{
|
{
|
||||||
@ -188,6 +189,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
|
|||||||
envGetTmp(FermionField, f5);
|
envGetTmp(FermionField, f5);
|
||||||
a2a.makeLowModeV5D(v[il], f5, epack.evec[il], epack.eval[il]);
|
a2a.makeLowModeV5D(v[il], f5, epack.evec[il], epack.eval[il]);
|
||||||
}
|
}
|
||||||
|
MODULE_TIMER("W low mode");
|
||||||
LOG(Message) << "W vector i = " << il << " (low mode)" << std::endl;
|
LOG(Message) << "W vector i = " << il << " (low mode)" << std::endl;
|
||||||
if (Ls == 1)
|
if (Ls == 1)
|
||||||
{
|
{
|
||||||
@ -203,6 +205,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
|
|||||||
// High modes
|
// High modes
|
||||||
for (unsigned int ih = 0; ih < noise.size(); ih++)
|
for (unsigned int ih = 0; ih < noise.size(); ih++)
|
||||||
{
|
{
|
||||||
|
MODULE_TIMER("V high mode");
|
||||||
LOG(Message) << "V vector i = " << Nl_ + ih
|
LOG(Message) << "V vector i = " << Nl_ + ih
|
||||||
<< " (" << ((Nl_ > 0) ? "high " : "")
|
<< " (" << ((Nl_ > 0) ? "high " : "")
|
||||||
<< "stochastic mode)" << std::endl;
|
<< "stochastic mode)" << std::endl;
|
||||||
@ -215,6 +218,7 @@ void TA2AVectors<FImpl, Pack>::execute(void)
|
|||||||
envGetTmp(FermionField, f5);
|
envGetTmp(FermionField, f5);
|
||||||
a2a.makeHighModeV5D(v[Nl_ + ih], f5, noise[ih]);
|
a2a.makeHighModeV5D(v[Nl_ + ih], f5, noise[ih]);
|
||||||
}
|
}
|
||||||
|
MODULE_TIMER("W high mode");
|
||||||
LOG(Message) << "W vector i = " << Nl_ + ih
|
LOG(Message) << "W vector i = " << Nl_ + ih
|
||||||
<< " (" << ((Nl_ > 0) ? "high " : "")
|
<< " (" << ((Nl_ > 0) ? "high " : "")
|
||||||
<< "stochastic mode)" << std::endl;
|
<< "stochastic mode)" << std::endl;
|
||||||
|
@ -622,9 +622,14 @@ VirtualMachine::Program VirtualMachine::schedule(const GeneticPar &par)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// general execution ///////////////////////////////////////////////////////////
|
// general execution ///////////////////////////////////////////////////////////
|
||||||
#define BIG_SEP "==============="
|
#define BIG_SEP "==============="
|
||||||
#define SEP "---------------"
|
#define SEP "---------------"
|
||||||
|
// Separator prefixed to the sub-section log lines of a measurement step
// (module execution, timings, memory management).
#define SMALL_SEP "..............."
|
||||||
#define MEM_MSG(size) sizeString(size)
|
#define MEM_MSG(size) sizeString(size)
|
||||||
|
// Stream fragment printing one timing line: `name` right-aligned on 30
// columns, the count of `t` followed by " us", then the percentage of `total`
// it represents with one decimal. To be used on the right of an `<<`.
// Every argument is parenthesised so that compound expressions can be passed
// safely; note that `t` is evaluated twice.
#define PRINT_TIME(name, t, total) \
std::right << std::setw(30) << (name) << ": " << (t).count() << " us (" \
<< std::fixed << std::setprecision(1) \
<< static_cast<double>((t).count())/(total)*100 << "%)"
|
||||||
|
|
||||||
void VirtualMachine::executeProgram(const Program &p)
|
void VirtualMachine::executeProgram(const Program &p)
|
||||||
{
|
{
|
||||||
@ -654,11 +659,31 @@ void VirtualMachine::executeProgram(const Program &p)
|
|||||||
LOG(Message) << SEP << " Measurement step " << i + 1 << "/"
|
LOG(Message) << SEP << " Measurement step " << i + 1 << "/"
|
||||||
<< p.size() << " (module '" << module_[p[i]].name
|
<< p.size() << " (module '" << module_[p[i]].name
|
||||||
<< "') " << SEP << std::endl;
|
<< "') " << SEP << std::endl;
|
||||||
|
LOG(Message) << SMALL_SEP << " Module execution" << std::endl;
|
||||||
currentModule_ = p[i];
|
currentModule_ = p[i];
|
||||||
(*module_[p[i]].data)();
|
(*module_[p[i]].data)();
|
||||||
currentModule_ = -1;
|
currentModule_ = -1;
|
||||||
sizeBefore = env().getTotalSize();
|
sizeBefore = env().getTotalSize();
|
||||||
|
// print time profile after execution
|
||||||
|
LOG(Message) << SMALL_SEP << " Timings" << std::endl;
|
||||||
|
|
||||||
|
std::map<std::string, GridTime> ctiming, gtiming;
|
||||||
|
GridTime total;
|
||||||
|
|
||||||
|
ctiming = module_[p[i]].data->getTimings();
|
||||||
|
total = ctiming.at("_total");
|
||||||
|
gtiming["total"] = ctiming["_total"]; ctiming.erase("_total");
|
||||||
|
gtiming["setup"] = ctiming["_setup"]; ctiming.erase("_setup");
|
||||||
|
gtiming["execution"] = ctiming["_execute"]; ctiming.erase("_execute");
|
||||||
|
LOG(Message) << "* GLOBAL TIMERS" << std::endl;
|
||||||
|
printTimeProfile(gtiming, total);
|
||||||
|
if (!ctiming.empty())
|
||||||
|
{
|
||||||
|
LOG(Message) << "* CUSTOM TIMERS" << std::endl;
|
||||||
|
printTimeProfile(ctiming, total);
|
||||||
|
}
|
||||||
// print used memory after execution
|
// print used memory after execution
|
||||||
|
LOG(Message) << SMALL_SEP << " Memory management" << std::endl;
|
||||||
LOG(Message) << "Allocated objects: " << MEM_MSG(sizeBefore)
|
LOG(Message) << "Allocated objects: " << MEM_MSG(sizeBefore)
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
if (sizeBefore > memPeak)
|
if (sizeBefore > memPeak)
|
||||||
|
@ -102,6 +102,9 @@ public:
|
|||||||
assert(running == false);
|
assert(running == false);
|
||||||
return (uint64_t) accumulator.count();
|
return (uint64_t) accumulator.count();
|
||||||
}
|
}
|
||||||
|
bool isRunning(void){
|
||||||
|
return running;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user