Move barrier into the StencilSend begin routine

2025-11-23 16:09:32 +00:00 · 2022-08-04 13:35:26 -04:00
parent 74f10c2dc0
commit 75bb6b2b40
1 changed file with 8 additions and 28 deletions
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -359,6 +359,7 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    accelerator_barrier();
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 					Packets[i].send_buf,
@@ -371,39 +372,19 @@ public:
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    _grid->StencilSendToRecvFromComplete(MpiReqs,i);
+    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
  ////////////////////////////////////////////////////////////////////////
  void Communicate(void)
  {
-    if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
+    /////////////////////////////////////////////////////////
-      /////////////////////////////////////////////////////////
+    // Concurrent and non-threaded asynch calls to MPI
-      // several way threaded on different communicators.
+    /////////////////////////////////////////////////////////
-      // Cannot combine with Dirichlet operators
+    std::vector<std::vector<CommsRequest_t> > reqs;
-      // This scheme is needed on Intel Omnipath for best performance
+    this->CommunicateBegin(reqs);
-      // Deprecate once there are very few omnipath clusters
+    this->CommunicateComplete(reqs);
      /////////////////////////////////////////////////////////
      int nthreads = CartesianCommunicator::nCommThreads;
      int old = GridThread::GetThreads();
      GridThread::SetThreads(nthreads);
      thread_for(i,Packets.size(),{
 	  _grid->StencilSendToRecvFrom(Packets[i].send_buf,
 				       Packets[i].to_rank,Packets[i].do_send,
 				       Packets[i].recv_buf,
 				       Packets[i].from_rank,Packets[i].do_recv,
 				       Packets[i].bytes,i);
      });
      GridThread::SetThreads(old);
    } else { 
      /////////////////////////////////////////////////////////
      // Concurrent and non-threaded asynch calls to MPI
      /////////////////////////////////////////////////////////
      std::vector<std::vector<CommsRequest_t> > reqs;
      this->CommunicateBegin(reqs);
      this->CommunicateComplete(reqs);
    }
  }
  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
@@ -483,7 +464,6 @@ public:
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
    accelerator_barrier();
  }
  /////////////////////////