1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Move the accelerator_barrier() call into CommunicateBegin, ahead of the StencilSendToRecvFromBegin calls

This commit is contained in:
Peter Boyle 2022-08-04 13:35:26 -04:00
parent 74f10c2dc0
commit 75bb6b2b40

View File

@ -359,6 +359,7 @@ public:
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
accelerator_barrier();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf, Packets[i].send_buf,
@ -371,39 +372,19 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
_grid->StencilSendToRecvFromComplete(MpiReqs,i); _grid->StencilSendToRecvFromComplete(MpiReqs,0);
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel. // Blocking send and receive. Either sequential or parallel.
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void Communicate(void) void Communicate(void)
{ {
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){ /////////////////////////////////////////////////////////
///////////////////////////////////////////////////////// // Concurrent and non-threaded asynch calls to MPI
// several way threaded on different communicators. /////////////////////////////////////////////////////////
// Cannot combine with Dirichlet operators std::vector<std::vector<CommsRequest_t> > reqs;
// This scheme is needed on Intel Omnipath for best performance this->CommunicateBegin(reqs);
// Deprecate once there are very few omnipath clusters this->CommunicateComplete(reqs);
/////////////////////////////////////////////////////////
int nthreads = CartesianCommunicator::nCommThreads;
int old = GridThread::GetThreads();
GridThread::SetThreads(nthreads);
thread_for(i,Packets.size(),{
_grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].bytes,i);
});
GridThread::SetThreads(old);
} else {
/////////////////////////////////////////////////////////
// Concurrent and non-threaded asynch calls to MPI
/////////////////////////////////////////////////////////
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
}
} }
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
@ -483,7 +464,6 @@ public:
face_table_computed=1; face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size); assert(u_comm_offset==_unified_buffer_size);
accelerator_barrier();
} }
///////////////////////// /////////////////////////