1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Threaded MPI calls patches

This commit is contained in:
Peter Boyle 2017-07-29 13:06:53 -04:00
parent 6f5a5cd9b3
commit 14d53e1c9e
8 changed files with 128 additions and 66 deletions

View File

@ -489,7 +489,7 @@ int main (int argc, char ** argv)
//assert(norm2(src_e)<1.0e-4); //assert(norm2(src_e)<1.0e-4);
//assert(norm2(src_o)<1.0e-4); //assert(norm2(src_o)<1.0e-4);
exit(0);
Grid_finalize(); Grid_finalize();
} }

View File

@ -92,11 +92,15 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
//////////////////
// Hack 2MB align; could make option probably doesn't need configurability
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else #else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif #endif
return ptr; return ptr;

View File

@ -34,7 +34,9 @@ namespace Grid {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void * CartesianCommunicator::ShmCommBuf; void * CartesianCommunicator::ShmCommBuf;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024; uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; CartesianCommunicator::CommunicatorPolicy_t
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1;
///////////////////////////////// /////////////////////////////////
// Alloc, free shmem region // Alloc, free shmem region

View File

@ -54,8 +54,9 @@ class CartesianCommunicator {
// 128MB shared memory for comms enought for 48^4 local vol comms // 128MB shared memory for comms enought for 48^4 local vol comms
// Give external control (command line override?) of this // Give external control (command line override?) of this
static const int MAXLOG2RANKSPERNODE = 16; static const int MAXLOG2RANKSPERNODE = 16;
static uint64_t MAX_MPI_SHM_BYTES; static uint64_t MAX_MPI_SHM_BYTES;
static int nCommThreads;
// Communicator should know nothing of the physics grid, only processor grid. // Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all int _Nprocessors; // How many in all
@ -215,6 +216,12 @@ class CartesianCommunicator {
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank, int xmit_to_rank,
@ -222,6 +229,7 @@ class CartesianCommunicator {
int recv_from_rank, int recv_from_rank,
int bytes,int dir); int bytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i); void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
void StencilBarrier(void); void StencilBarrier(void);

View File

@ -242,7 +242,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank, int recv_from_rank,
int bytes,int dir) int bytes,int dir)
{ {
assert(false);
/*
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
assert(dir < communicator_halo.size()); assert(dir < communicator_halo.size());
@ -254,6 +255,28 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
communicator_halo[dir],MPI_STATUS_IGNORE); communicator_halo[dir],MPI_STATUS_IGNORE);
assert(ierr==0); assert(ierr==0);
return 2.0*bytes; return 2.0*bytes;
*/
}
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
assert(dir < communicator_halo.size());
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank,
communicator_halo[dir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
communicator_halo[dir], &req[0]);
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ {

View File

@ -391,37 +391,57 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
// Rely on async comms; start comms before merge of local data // Rely on async comms; start comms before merge of local data
DhopComputeTime-=usecond(); double ctime=0;
DhopCommTime-=usecond(); double ptime=0;
#pragma omp parallel // DhopComputeTime-=usecond();
// DhopCommTime-=usecond();
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
{ {
// Should time this somehow; hard as the threads fork nowait int tid = omp_get_thread_num();
st.CommunicateThreaded(); int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = st.Packets.size();
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U._grid->oSites();
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
if (dag == DaggerYes) { // do the compute
#pragma omp for if (dag == DaggerYes) {
for (int ss = 0; ss < U._grid->oSites(); ss++) { for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss; int sU = ss;
int sF = LLs * sU; int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
}
}
ptime = usecond() - start;
} }
} else { {
#pragma omp for double start = usecond();
for (int ss = 0; ss < U._grid->oSites(); ss++) { st.CommunicateThreaded();
int sU = ss; ctime = usecond() - start;
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
} }
} }
#pragma omp single DhopCommTime += ctime;
DhopComputeTime+=usecond(); DhopComputeTime+=ptime;
#pragma omp taskwait
#pragma omp single
DhopCommTime+=usecond();
} // Closes parallel region and waits the comms (I hope)
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);

View File

@ -185,6 +185,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
double splicetime; double splicetime;
double nosplicetime; double nosplicetime;
double calls; double calls;
std::vector<double> comms_bytesthr;
std::vector<double> commtimethr;
//////////////////////////////////////// ////////////////////////////////////////
// Stencil query // Stencil query
@ -250,36 +252,22 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
////////////////////////////////////////// //////////////////////////////////////////
void CommunicateThreaded() void CommunicateThreaded()
{ {
for(int i=0;i<Packets.size();i++){ // must be called in parallel region
#pragma omp task int mythread = omp_get_thread_num();
{ int nthreads = CartesianCommunicator::nCommThreads;
double start; if (nthreads == -1) nthreads = Packets.size();
double stop; if (mythread < nthreads) {
start = usecond(); for (int i = mythread; i < Packets.size(); i += nthreads) {
uint64_t bytes; double start = usecond();
std::vector<CommsRequest_t> reqs; uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
bytes=_grid->StencilSendToRecvFromBegin(reqs, Packets[i].to_rank,
Packets[i].send_buf, Packets[i].recv_buf,
Packets[i].to_rank, Packets[i].from_rank,
Packets[i].recv_buf, Packets[i].bytes,i);
Packets[i].from_rank, comms_bytesthr[mythread] += bytes;
Packets[i].bytes,i); commtimethr[mythread] += usecond() - start;
_grid->StencilSendToRecvFromComplete(reqs,i);
// Last task logged; this is approximate but hard to catch
// the last to complete
stop = usecond();
stop = stop - start;
if ( i==0 ) commtime+=stop;
#pragma omp critical
{
comms_bytes+=bytes;
}
} }
} }
} }
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
@ -475,7 +463,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
int checkerboard, int checkerboard,
const std::vector<int> &directions, const std::vector<int> &directions,
const std::vector<int> &distances) const std::vector<int> &distances)
: _permute_type(npoints), _comm_buf_size(npoints) : _permute_type(npoints),
_comm_buf_size(npoints),
comms_bytesthr(npoints),
commtimethr(npoints)
{ {
face_table_computed=0; face_table_computed=0;
_npoints = npoints; _npoints = npoints;
@ -1029,6 +1020,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
void ZeroCounters(void) { void ZeroCounters(void) {
gathertime = 0.; gathertime = 0.;
commtime = 0.; commtime = 0.;
memset(&commtimethr[0], 0, sizeof(commtimethr));
memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr));
halogtime = 0.; halogtime = 0.;
mergetime = 0.; mergetime = 0.;
decompresstime = 0.; decompresstime = 0.;
@ -1044,6 +1037,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
RealD NP = _grid->_Nprocessors; RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount(); RealD NN = _grid->NodeCount();
double t = 0;
// if commtimethr is set they were all done in parallel so take the max
// but add up the bytes
for (int i = 0; i < 8; ++i) {
comms_bytes += comms_bytesthr[i];
if (t < commtimethr[i]) t = commtimethr[i];
}
commtime += t;
_grid->GlobalSum(commtime); commtime/=NP; _grid->GlobalSum(commtime); commtime/=NP;
if ( calls > 0. ) { if ( calls > 0. ) {

View File

@ -359,7 +359,11 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1; LebesgueOrder::UseLebesgueOrder=1;
} }
CartesianCommunicator::nCommThreads = -1;
if( GridCmdOptionExists(*argv,*argv+*argc,"--commthreads") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--commthreads");
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block); GridCmdOptionIntVector(arg,LebesgueOrder::Block);