
Benchmark prep

This commit is contained in:
Peter Boyle
2017-08-25 09:25:54 +01:00
parent b49bec0cec
commit c3b1263e75
10 changed files with 494 additions and 71 deletions


@@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
// Timing info; ugly; possibly temporary
/////////////////////////////////////////
double commtime;
double mpi3synctime;
double mpi3synctime_g;
double shmmergetime;
double gathertime;
double gathermtime;
double halogtime;
@@ -185,8 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
double splicetime;
double nosplicetime;
double calls;
std::vector<double> comms_bytesthr;
std::vector<double> commtimethr;
std::vector<double> comm_bytes_thr;
std::vector<double> comm_time_thr;
std::vector<double> comm_enter_thr;
std::vector<double> comm_leave_thr;
////////////////////////////////////////
// Stencil query
@@ -262,18 +267,45 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
#endif
if (nthreads == -1) nthreads = 1;
if (mythread < nthreads) {
comm_enter_thr[mythread] = usecond();
for (int i = mythread; i < Packets.size(); i += nthreads) {
double start = usecond();
uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comms_bytesthr[mythread] += bytes;
commtimethr[mythread] += usecond() - start;
comm_bytes_thr[mythread] += bytes;
}
comm_leave_thr[mythread]= usecond();
comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
}
}
void CollateThreads(void)
{
int nthreads = CartesianCommunicator::nCommThreads;
double first=0.0;
double last =0.0;
for(int t=0;t<nthreads;t++) {
double t0 = comm_enter_thr[t];
double t1 = comm_leave_thr[t];
comms_bytes+=comm_bytes_thr[t];
comm_enter_thr[t] = 0.0;
comm_leave_thr[t] = 0.0;
comm_time_thr[t] = 0.0;
comm_bytes_thr[t]=0;
if ( first == 0.0 ) first = t0; // seed with the first thread's enter time
if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
if ( t1 > last ) last = t1; // max time seen
}
commtime+= last-first;
}
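// Illustrative sketch, not part of the commit: CollateThreads above collapses
// the per-thread [enter,leave] timestamps into a single wall-clock window,
// charging commtime with max(leave) - min(enter) while the per-thread byte
// counts are simply summed. The helper name wall_clock_window is hypothetical.
#include <cstdio>
#include <vector>

static double wall_clock_window(const std::vector<double> &enter,
                                const std::vector<double> &leave)
{
  double first = 0.0, last = 0.0;
  for (size_t t = 0; t < enter.size(); t++) {
    if (first == 0.0) first = enter[t];                        // seed on first pass
    if (enter[t] > 0.0 && enter[t] < first) first = enter[t];  // earliest entry seen
    if (leave[t] > last) last = leave[t];                      // latest exit seen
  }
  return last - first;                                         // elapsed microseconds
}

int main(void)
{
  std::vector<double> enter = {100.0, 105.0, 0.0, 102.0};      // usec; slot 2 never ran
  std::vector<double> leave = {180.0, 200.0, 0.0, 190.0};
  std::printf("comm window = %.1f usec\n", wall_clock_window(enter, leave)); // 100.0
  return 0;
}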
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
reqs.resize(Packets.size());
@@ -295,14 +327,48 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
}
commtime+=usecond();
}
void Communicate(void)
{
#ifdef GRID_OMP
#pragma omp parallel
{
// must be called in a parallel region
int mythread = omp_get_thread_num();
int maxthreads= omp_get_max_threads();
int nthreads = CartesianCommunicator::nCommThreads;
assert(nthreads <= maxthreads);
if (nthreads == -1) nthreads = 1;
#else
int mythread = 0;
int nthreads = 1;
#endif
if (mythread < nthreads) {
for (int i = mythread; i < Packets.size(); i += nthreads) {
double start = usecond();
comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comm_time_thr[mythread] += usecond() - start;
}
}
#ifdef GRID_OMP
}
#endif
}
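// Illustrative sketch, not part of the commit: the round-robin packet split
// used by Communicate() and CommunicateThreaded() above. Communication thread
// t services packets t, t+nthreads, t+2*nthreads, ..., so each packet is sent
// exactly once and the threads need no locking. npackets and nthreads here are
// stand-ins for Packets.size() and CartesianCommunicator::nCommThreads.
#include <cstdio>

int main(void)
{
  const int npackets = 10;
  const int nthreads = 3;
  for (int mythread = 0; mythread < nthreads; mythread++) {
    for (int i = mythread; i < npackets; i += nthreads) {
      // in the real code this is the _grid->StencilSendToRecvFrom(...) call
      std::printf("packet %d -> comm thread %d\n", i, mythread);
    }
  }
  return 0;
}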
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{
std::vector<std::vector<CommsRequest_t> > reqs;
Prepare();
HaloGather(source,compress);
// Concurrent
CommunicateBegin(reqs);
CommunicateComplete(reqs);
// Sequential
// Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
}
@@ -363,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
mpi3synctime_g-=usecond();
_grid->StencilBarrier();// Synch shared memory within a single node
mpi3synctime_g+=usecond();
// conformable(source._grid,_grid);
assert(source._grid==_grid);
@@ -423,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
CommsMerge(decompress,Mergers,Decompressions);
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
mpi3synctime-=usecond();
_grid->StencilBarrier();// Synch shared memory within a single node
mpi3synctime+=usecond();
shmmergetime-=usecond();
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
shmmergetime+=usecond();
}
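// Illustrative sketch, not part of the commit: the accumulate-by-difference
// timer idiom used above for mpi3synctime, mpi3synctime_g and shmmergetime.
// Subtracting the clock on entry and adding it back on exit grows the counter
// by the elapsed time of each call, so it accumulates over many calls.
// usec_now() is a stand-in for Grid's usecond().
#include <chrono>
#include <cstdio>

static double usec_now(void)
{
  using namespace std::chrono;
  return duration<double, std::micro>(steady_clock::now().time_since_epoch()).count();
}

int main(void)
{
  double shm_merge_usec = 0.0;               // accumulated over all calls
  for (int call = 0; call < 3; call++) {
    shm_merge_usec -= usec_now();
    volatile double sink = 0.0;              // stands in for the merge work
    for (int i = 0; i < 100000; i++) sink += i;
    shm_merge_usec += usec_now();
  }
  std::printf("total merge time %.1f usec over 3 calls\n", shm_merge_usec);
  return 0;
}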
template<class decompressor>
@@ -470,8 +542,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
const std::vector<int> &distances)
: _permute_type(npoints),
_comm_buf_size(npoints),
comms_bytesthr(npoints),
commtimethr(npoints)
comm_bytes_thr(npoints),
comm_enter_thr(npoints),
comm_leave_thr(npoints),
comm_time_thr(npoints)
{
face_table_computed=0;
_npoints = npoints;
@@ -1025,8 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
void ZeroCounters(void) {
gathertime = 0.;
commtime = 0.;
memset(&commtimethr[0], 0, sizeof(commtimethr));
memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr));
mpi3synctime=0.;
mpi3synctime_g=0.;
shmmergetime=0.;
for(int i=0;i<_npoints;i++){
comm_time_thr[i]=0;
comm_bytes_thr[i]=0;
comm_enter_thr[i]=0;
comm_leave_thr[i]=0;
}
halogtime = 0.;
mergetime = 0.;
decompresstime = 0.;
@@ -1043,13 +1124,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
double t = 0;
// if commtimethr is set they were all done in parallel so take the max
// if comm_time_thr is set they were all done in parallel so take the max
// but add up the bytes
int threaded = 0 ;
for (int i = 0; i < 8; ++i) {
comms_bytes += comms_bytesthr[i];
if (t < commtimethr[i]) t = commtimethr[i];
if ( comm_time_thr[i]>0.0 ) {
threaded = 1;
comms_bytes += comm_bytes_thr[i];
if (t < comm_time_thr[i]) t = comm_time_thr[i];
}
}
commtime += t;
if (threaded) commtime += t;
_grid->GlobalSum(commtime); commtime/=NP;
if ( calls > 0. ) {
@@ -1065,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
}
PRINTIT(mpi3synctime);
PRINTIT(mpi3synctime_g);
PRINTIT(shmmergetime);
PRINTIT(splicetime);
PRINTIT(nosplicetime);
}
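// Illustrative sketch, not part of the commit: the aggregation performed in
// Report() above. Per-thread byte counts are summed because each thread moved
// distinct packets, while the elapsed time is the maximum over threads because
// they ran concurrently; bandwidth then follows as bytes/usec = MB/s, and a
// further /1000 gives the GB/s figure printed per rank.
#include <cstdio>
#include <vector>

int main(void)
{
  std::vector<double> comm_bytes_thr = {2.0e9, 2.0e9, 1.5e9};  // bytes moved per thread
  std::vector<double> comm_time_thr  = {5.0e5, 5.5e5, 4.8e5};  // usec spent per thread

  double comms_bytes = 0.0, commtime = 0.0;
  for (size_t i = 0; i < comm_time_thr.size(); i++) {
    if (comm_time_thr[i] > 0.0) {
      comms_bytes += comm_bytes_thr[i];                             // volumes add
      if (commtime < comm_time_thr[i]) commtime = comm_time_thr[i]; // times overlap: take max
    }
  }
  std::printf("%.3g bytes in %.3g usec -> %.2f GB/s per rank\n",
              comms_bytes, commtime, comms_bytes / commtime / 1000.0);
  return 0;
}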