mirror of https://github.com/paboyle/Grid.git

commit 05e562e3d7 (parent dd3bbb8fa2)

Move the copy synch out to stencil and do one per call instead of one per packet
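In outline: CommunicateBegin queues each intra-node transfer as an asynchronous copy on a copy stream (or posts an MPI ISend for off-node traffic), and CommunicateComplete then issues a single acceleratorCopySynchronise() for the whole call instead of synchronising after every packet. A minimal sketch of that pattern, assuming the CUDA runtime in place of Grid's accelerator layer; Packet, copyStream, and the two free functions are illustrative names, not Grid's API:

    // Sketch only: batch the asynchronous copies, synchronise once per call.
    #include <cuda_runtime.h>
    #include <cstddef>
    #include <vector>

    struct Packet { void *send_buf; void *recv_buf; size_t bytes; };

    // Begin: queue every intra-node copy on one stream; no per-packet sync.
    void CommunicateBegin(const std::vector<Packet> &packets,
                          cudaStream_t copyStream)
    {
      for (const auto &p : packets) {
        cudaMemcpyAsync(p.recv_buf, p.send_buf, p.bytes,
                        cudaMemcpyDeviceToDevice, copyStream);
      }
    }

    // Complete: one synchronisation covering all queued copies.
    void CommunicateComplete(cudaStream_t copyStream)
    {
      cudaStreamSynchronize(copyStream);
    }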
@@ -398,6 +398,8 @@ public:
     ////////////////////////////////////////////////////////////////////////
     void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
     {
+      // Buffers are gathered AND synchronised
+      // Copies are MPI ISend OR asynch copy on copy stream
       reqs.resize(Packets.size());
       commtime-=usecond();
       for(int i=0;i<Packets.size();i++){
@@ -410,14 +412,18 @@ public:
         comms_bytes+=bytes;
         shm_bytes +=2*Packets[i].bytes-bytes;
       }
-      _grid->StencilBarrier();// Synch shared memory on a single nodes
     }

     void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
     {
+      // complete intranode
+      acceleratorCopySynchronise();
+      // complete MPI
       for(int i=0;i<Packets.size();i++){
         _grid->StencilSendToRecvFromComplete(reqs[i],i);
       }
+      // Everyone agrees we are all done
+      _grid->StencilBarrier();
       commtime+=usecond();
     }
     ////////////////////////////////////////////////////////////////////////
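The completion path above has three ordered steps: drain the copy stream (complete intranode), drain the MPI requests (complete MPI), then barrier so everyone agrees the halo is ready. A hedged sketch of the same ordering using raw MPI in place of Grid's StencilSendToRecvFromComplete and StencilBarrier wrappers; node_comm and the flattened request vector are assumptions for illustration:

    #include <mpi.h>
    #include <vector>

    void complete(std::vector<MPI_Request> &reqs, MPI_Comm node_comm)
    {
      // 1. complete intranode: acceleratorCopySynchronise() in the patch,
      //    i.e. wait on the asynchronous copy stream (omitted here).
      // 2. complete MPI: drain every outstanding ISend/IRecv.
      MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
      // 3. everyone agrees we are all done.
      MPI_Barrier(node_comm);
    }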
@@ -425,34 +431,10 @@ public:
     ////////////////////////////////////////////////////////////////////////
     void Communicate(void)
     {
-      if ( 0 ){
-        thread_region {
-          // must be called in parallel region
-          int mythread  = thread_num();
-          int maxthreads= thread_max();
-          int nthreads = CartesianCommunicator::nCommThreads;
-          assert(nthreads <= maxthreads);
-          if (nthreads == -1) nthreads = 1;
-          if (mythread < nthreads) {
-            for (int i = mythread; i < Packets.size(); i += nthreads) {
-              double start = usecond();
-              uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-                                                           Packets[i].to_rank,
-                                                           Packets[i].recv_buf,
-                                                           Packets[i].from_rank,
-                                                           Packets[i].bytes,i);
-              comm_bytes_thr[mythread] += bytes;
-              shm_bytes_thr[mythread]  += Packets[i].bytes - bytes;
-              comm_time_thr[mythread]  += usecond() - start;
-            }
-          }
-        }
-      } else { // Concurrent and non-threaded asynch calls to MPI
        std::vector<std::vector<CommsRequest_t> > reqs;
        this->CommunicateBegin(reqs);
        this->CommunicateComplete(reqs);
       }
-    }

     template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
     {
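With the dead if(0) threaded branch deleted, Communicate() reduces to CommunicateBegin followed immediately by CommunicateComplete. Keeping the two entry points separate also lets a caller overlap independent computation between them; a hypothetical, self-contained usage sketch (Stencil, computeInterior, and the stub types are illustrative, not from this patch):

    #include <vector>

    struct CommsRequest_t {};                  // stand-in for Grid's request type
    struct Stencil {                           // stand-in for the stencil class
      void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &) {}
      void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &) {}
    };
    void computeInterior() {}                  // work that needs no halo data

    int main()
    {
      Stencil stencil;
      std::vector<std::vector<CommsRequest_t> > reqs;
      stencil.CommunicateBegin(reqs);    // post ISends / queue async copies
      computeInterior();                 // overlap: independent computation
      stencil.CommunicateComplete(reqs); // one copy-sync, drain MPI, barrier
      return 0;
    }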
@@ -527,7 +509,6 @@ public:
     _grid->StencilBarrier();// Synch shared memory on a single nodes
     mpi3synctime_g+=usecond();

-    // conformable(source.Grid(),_grid);
     assert(source.Grid()==_grid);
     halogtime-=usecond();

@@ -586,13 +567,8 @@ public:
       CommsMerge(decompress,Mergers,Decompressions);
     }
     template<class decompressor> void CommsMergeSHM(decompressor decompress) {
-      mpi3synctime-=usecond();
-      accelerator_barrier();
-      _grid->StencilBarrier();// Synch shared memory on a single nodes
-      mpi3synctime+=usecond();
-      shmmergetime-=usecond();
-      CommsMerge(decompress,MergersSHM,DecompressionsSHM);
-      shmmergetime+=usecond();
+      assert(MergersSHM.size()==0);
+      assert(DecompressionsSHM.size()==0);
     }

     template<class decompressor>