Threaded MPI calls patches

2025-07-29 10:47:07 +01:00 · 2017-07-29 13:06:53 -04:00
parent 6f5a5cd9b3
commit 14d53e1c9e
8 changed files with 128 additions and 66 deletions
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -34,7 +34,9 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 void *              CartesianCommunicator::ShmCommBuf;
 uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
-CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+CartesianCommunicator::CommunicatorPolicy_t  
+CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+int CartesianCommunicator::nCommThreads = -1;

 /////////////////////////////////
 // Alloc, free shmem region
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -54,8 +54,9 @@ class CartesianCommunicator {
  // 128MB shared memory for comms enought for 48^4 local vol comms
  // Give external control (command line override?) of this

-  static const int      MAXLOG2RANKSPERNODE = 16;            
-  static uint64_t MAX_MPI_SHM_BYTES;
+  static const int MAXLOG2RANKSPERNODE = 16;            
+  static uint64_t  MAX_MPI_SHM_BYTES;
+  static int       nCommThreads;

  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
@@ -125,7 +126,7 @@ class CartesianCommunicator {
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
-
+  
  size_t heap_top;
  size_t heap_bytes;

@@ -215,6 +216,12 @@ class CartesianCommunicator {
  
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

+  double StencilSendToRecvFrom(void *xmit,
+			       int xmit_to_rank,
+			       void *recv,
+			       int recv_from_rank,
+			       int bytes,int dir);
+
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,
@@ -222,6 +229,7 @@ class CartesianCommunicator {
 				    int recv_from_rank,
 				    int bytes,int dir);
  
+  
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
  void StencilBarrier(void);

--- a/lib/communicator/Communicator_mpit.cc
+++ b/lib/communicator/Communicator_mpit.cc
@@ -242,7 +242,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int recv_from_rank,
 							 int bytes,int dir)
 {
-
+  assert(false);
+  /*
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());
@@ -254,6 +255,28 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 		    communicator_halo[dir],MPI_STATUS_IGNORE);
  assert(ierr==0);
  return 2.0*bytes;
+  */
+}
+
+double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
+						    int xmit_to_rank,
+						    void *recv,
+						    int recv_from_rank,
+						    int bytes,int dir)
+{
+  int myrank = _processor;
+  int ierr;
+  assert(dir < communicator_halo.size());
+  
+  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
+  // Give the CPU to MPI immediately; can use threads to overlap optionally
+  MPI_Request req[2];
+  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank,
+	    communicator_halo[dir],&req[1]);
+  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
+	    communicator_halo[dir], &req[0]);
+  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
+  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {