mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Threaded MPI calls patches
This commit is contained in:
parent
6f5a5cd9b3
commit
14d53e1c9e
@ -489,7 +489,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
//assert(norm2(src_e)<1.0e-4);
|
//assert(norm2(src_e)<1.0e-4);
|
||||||
//assert(norm2(src_o)<1.0e-4);
|
//assert(norm2(src_o)<1.0e-4);
|
||||||
|
exit(0);
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -92,11 +92,15 @@ public:
|
|||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
|
|
||||||
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
||||||
|
//////////////////
|
||||||
|
// Hack 2MB align; could make option probably doesn't need configurability
|
||||||
|
//////////////////
|
||||||
|
//define GRID_ALLOC_ALIGN (128)
|
||||||
|
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
|
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
|
||||||
#else
|
#else
|
||||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
|
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return ptr;
|
return ptr;
|
||||||
|
@ -34,7 +34,9 @@ namespace Grid {
|
|||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void * CartesianCommunicator::ShmCommBuf;
|
void * CartesianCommunicator::ShmCommBuf;
|
||||||
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
|
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
|
||||||
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
CartesianCommunicator::CommunicatorPolicy_t
|
||||||
|
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
||||||
|
int CartesianCommunicator::nCommThreads = -1;
|
||||||
|
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
// Alloc, free shmem region
|
// Alloc, free shmem region
|
||||||
|
@ -56,6 +56,7 @@ class CartesianCommunicator {
|
|||||||
|
|
||||||
static const int MAXLOG2RANKSPERNODE = 16;
|
static const int MAXLOG2RANKSPERNODE = 16;
|
||||||
static uint64_t MAX_MPI_SHM_BYTES;
|
static uint64_t MAX_MPI_SHM_BYTES;
|
||||||
|
static int nCommThreads;
|
||||||
|
|
||||||
// Communicator should know nothing of the physics grid, only processor grid.
|
// Communicator should know nothing of the physics grid, only processor grid.
|
||||||
int _Nprocessors; // How many in all
|
int _Nprocessors; // How many in all
|
||||||
@ -215,6 +216,12 @@ class CartesianCommunicator {
|
|||||||
|
|
||||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||||
|
|
||||||
|
double StencilSendToRecvFrom(void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes,int dir);
|
||||||
|
|
||||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
@ -222,6 +229,7 @@ class CartesianCommunicator {
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes,int dir);
|
int bytes,int dir);
|
||||||
|
|
||||||
|
|
||||||
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
|
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
|
||||||
void StencilBarrier(void);
|
void StencilBarrier(void);
|
||||||
|
|
||||||
|
@ -242,7 +242,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes,int dir)
|
int bytes,int dir)
|
||||||
{
|
{
|
||||||
|
assert(false);
|
||||||
|
/*
|
||||||
int myrank = _processor;
|
int myrank = _processor;
|
||||||
int ierr;
|
int ierr;
|
||||||
assert(dir < communicator_halo.size());
|
assert(dir < communicator_halo.size());
|
||||||
@ -254,6 +255,28 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
communicator_halo[dir],MPI_STATUS_IGNORE);
|
communicator_halo[dir],MPI_STATUS_IGNORE);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes,int dir)
|
||||||
|
{
|
||||||
|
int myrank = _processor;
|
||||||
|
int ierr;
|
||||||
|
assert(dir < communicator_halo.size());
|
||||||
|
|
||||||
|
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
|
||||||
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
|
MPI_Request req[2];
|
||||||
|
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank,
|
||||||
|
communicator_halo[dir],&req[1]);
|
||||||
|
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
|
||||||
|
communicator_halo[dir], &req[0]);
|
||||||
|
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
|
||||||
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
{
|
{
|
||||||
|
@ -391,37 +391,57 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
// Rely on async comms; start comms before merge of local data
|
// Rely on async comms; start comms before merge of local data
|
||||||
DhopComputeTime-=usecond();
|
double ctime=0;
|
||||||
DhopCommTime-=usecond();
|
double ptime=0;
|
||||||
#pragma omp parallel
|
// DhopComputeTime-=usecond();
|
||||||
|
// DhopCommTime-=usecond();
|
||||||
|
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||||
{
|
{
|
||||||
// Should time this somehow; hard as the threads fork nowait
|
int tid = omp_get_thread_num();
|
||||||
st.CommunicateThreaded();
|
int nthreads = omp_get_num_threads();
|
||||||
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = st.Packets.size();
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
double start = usecond();
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = U._grid->oSites();
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the compute
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
#pragma omp for
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
|
||||||
int sU = ss;
|
int sU = ss;
|
||||||
int sF = LLs * sU;
|
int sF = LLs * sU;
|
||||||
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
|
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#pragma omp for
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
|
||||||
int sU = ss;
|
int sU = ss;
|
||||||
int sF = LLs * sU;
|
int sF = LLs * sU;
|
||||||
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
|
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#pragma omp single
|
ptime = usecond() - start;
|
||||||
DhopComputeTime+=usecond();
|
}
|
||||||
|
{
|
||||||
#pragma omp taskwait
|
double start = usecond();
|
||||||
|
st.CommunicateThreaded();
|
||||||
#pragma omp single
|
ctime = usecond() - start;
|
||||||
DhopCommTime+=usecond();
|
}
|
||||||
} // Closes parallel region and waits the comms (I hope)
|
}
|
||||||
|
DhopCommTime += ctime;
|
||||||
|
DhopComputeTime+=ptime;
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
|
@ -185,6 +185,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
double splicetime;
|
double splicetime;
|
||||||
double nosplicetime;
|
double nosplicetime;
|
||||||
double calls;
|
double calls;
|
||||||
|
std::vector<double> comms_bytesthr;
|
||||||
|
std::vector<double> commtimethr;
|
||||||
|
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
// Stencil query
|
// Stencil query
|
||||||
@ -250,36 +252,22 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
void CommunicateThreaded()
|
void CommunicateThreaded()
|
||||||
{
|
{
|
||||||
for(int i=0;i<Packets.size();i++){
|
// must be called in parallel region
|
||||||
#pragma omp task
|
int mythread = omp_get_thread_num();
|
||||||
{
|
int nthreads = CartesianCommunicator::nCommThreads;
|
||||||
double start;
|
if (nthreads == -1) nthreads = Packets.size();
|
||||||
double stop;
|
if (mythread < nthreads) {
|
||||||
start = usecond();
|
for (int i = mythread; i < Packets.size(); i += nthreads) {
|
||||||
uint64_t bytes;
|
double start = usecond();
|
||||||
std::vector<CommsRequest_t> reqs;
|
uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
|
||||||
bytes=_grid->StencilSendToRecvFromBegin(reqs,
|
|
||||||
Packets[i].send_buf,
|
|
||||||
Packets[i].to_rank,
|
Packets[i].to_rank,
|
||||||
Packets[i].recv_buf,
|
Packets[i].recv_buf,
|
||||||
Packets[i].from_rank,
|
Packets[i].from_rank,
|
||||||
Packets[i].bytes,i);
|
Packets[i].bytes,i);
|
||||||
_grid->StencilSendToRecvFromComplete(reqs,i);
|
comms_bytesthr[mythread] += bytes;
|
||||||
// Last task logged; this is approximate but hard to catch
|
commtimethr[mythread] += usecond() - start;
|
||||||
// the last to complete
|
|
||||||
stop = usecond();
|
|
||||||
stop = stop - start;
|
|
||||||
|
|
||||||
if ( i==0 ) commtime+=stop;
|
|
||||||
|
|
||||||
#pragma omp critical
|
|
||||||
{
|
|
||||||
comms_bytes+=bytes;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
@ -475,7 +463,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: _permute_type(npoints), _comm_buf_size(npoints)
|
: _permute_type(npoints),
|
||||||
|
_comm_buf_size(npoints),
|
||||||
|
comms_bytesthr(npoints),
|
||||||
|
commtimethr(npoints)
|
||||||
{
|
{
|
||||||
face_table_computed=0;
|
face_table_computed=0;
|
||||||
_npoints = npoints;
|
_npoints = npoints;
|
||||||
@ -1029,6 +1020,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
void ZeroCounters(void) {
|
void ZeroCounters(void) {
|
||||||
gathertime = 0.;
|
gathertime = 0.;
|
||||||
commtime = 0.;
|
commtime = 0.;
|
||||||
|
memset(&commtimethr[0], 0, sizeof(commtimethr));
|
||||||
|
memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr));
|
||||||
halogtime = 0.;
|
halogtime = 0.;
|
||||||
mergetime = 0.;
|
mergetime = 0.;
|
||||||
decompresstime = 0.;
|
decompresstime = 0.;
|
||||||
@ -1044,6 +1037,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
||||||
RealD NP = _grid->_Nprocessors;
|
RealD NP = _grid->_Nprocessors;
|
||||||
RealD NN = _grid->NodeCount();
|
RealD NN = _grid->NodeCount();
|
||||||
|
double t = 0;
|
||||||
|
// if commtimethr is set they were all done in parallel so take the max
|
||||||
|
// but add up the bytes
|
||||||
|
for (int i = 0; i < 8; ++i) {
|
||||||
|
comms_bytes += comms_bytesthr[i];
|
||||||
|
if (t < commtimethr[i]) t = commtimethr[i];
|
||||||
|
}
|
||||||
|
commtime += t;
|
||||||
|
|
||||||
_grid->GlobalSum(commtime); commtime/=NP;
|
_grid->GlobalSum(commtime); commtime/=NP;
|
||||||
if ( calls > 0. ) {
|
if ( calls > 0. ) {
|
||||||
|
@ -359,7 +359,11 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
||||||
LebesgueOrder::UseLebesgueOrder=1;
|
LebesgueOrder::UseLebesgueOrder=1;
|
||||||
}
|
}
|
||||||
|
CartesianCommunicator::nCommThreads = -1;
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--commthreads") ){
|
||||||
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--commthreads");
|
||||||
|
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
|
||||||
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
||||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
||||||
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
||||||
|
Loading…
Reference in New Issue
Block a user