Mirror of https://github.com/paboyle/Grid.git, synced 2024-11-09 23:45:36 +00:00

Threaded MPI calls patches

Commit 14d53e1c9e (parent 6f5a5cd9b3)
@@ -489,7 +489,7 @@ int main (int argc, char ** argv)
  //assert(norm2(src_e)<1.0e-4);
  //assert(norm2(src_o)<1.0e-4);
  exit(0);
  Grid_finalize();
}
@@ -92,11 +92,15 @@ public:
  size_type bytes = __n*sizeof(_Tp);

  _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);

  //////////////////
  // Hack 2MB align; could make option probably doesn't need configurability
  //////////////////
  //define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
  if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
  if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
  if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
  if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif

  return ptr;
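For reference, a minimal sketch of the allocation policy this hunk introduces: prefer _mm_malloc when available, otherwise memalign, with the alignment raised from 128 bytes to 2 MiB, presumably so large lattice buffers can sit on 2 MiB huge-page boundaries. The names aligned_grid_alloc and kAllocAlign are illustrative, not Grid's:

    #include <cstddef>
    #include <cstdint>
    #include <cassert>
    #ifdef HAVE_MM_MALLOC_H
    #include <mm_malloc.h>
    #else
    #include <malloc.h>
    #endif

    static const std::size_t kAllocAlign = 2*1024*1024;   // mirrors GRID_ALLOC_ALIGN

    void *aligned_grid_alloc(std::size_t bytes) {
    #ifdef HAVE_MM_MALLOC_H
      void *p = _mm_malloc(bytes, kAllocAlign);            // Intel/posix aligned allocation
    #else
      void *p = memalign(kAllocAlign, bytes);              // glibc fallback
    #endif
      assert(p != nullptr);
      // the returned pointer honours the requested alignment
      assert((reinterpret_cast<std::uintptr_t>(p) % kAllocAlign) == 0);
      return p;
    }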
@@ -34,7 +34,9 @@ namespace Grid {
  ///////////////////////////////////////////////////////////////
  void * CartesianCommunicator::ShmCommBuf;
  uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
  CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
  CartesianCommunicator::CommunicatorPolicy_t
  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
  int CartesianCommunicator::nCommThreads = -1;

  /////////////////////////////////
  // Alloc, free shmem region
@@ -54,8 +54,9 @@ class CartesianCommunicator {
  // 128MB shared memory for comms enought for 48^4 local vol comms
  // Give external control (command line override?) of this

  static const int MAXLOG2RANKSPERNODE = 16;
  static uint64_t MAX_MPI_SHM_BYTES;
  static const int MAXLOG2RANKSPERNODE = 16;
  static uint64_t MAX_MPI_SHM_BYTES;
  static int nCommThreads;

  // Communicator should know nothing of the physics grid, only processor grid.
  int _Nprocessors;  // How many in all
@@ -215,6 +216,12 @@ class CartesianCommunicator {

  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

  double StencilSendToRecvFrom(void *xmit,
                               int xmit_to_rank,
                               void *recv,
                               int recv_from_rank,
                               int bytes,int dir);

  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                    void *xmit,
                                    int xmit_to_rank,
@@ -222,6 +229,7 @@ class CartesianCommunicator {
                                    int recv_from_rank,
                                    int bytes,int dir);

  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
  void StencilBarrier(void);
@@ -242,7 +242,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                          int recv_from_rank,
                                                          int bytes,int dir)
{
  assert(false);
  /*
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());
@@ -254,6 +255,28 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                        communicator_halo[dir],MPI_STATUS_IGNORE);
  assert(ierr==0);
  return 2.0*bytes;
  */
}

double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
                                                    int xmit_to_rank,
                                                    void *recv,
                                                    int recv_from_rank,
                                                    int bytes,int dir)
{
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());

  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  MPI_Request req[2];
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank,
            communicator_halo[dir],&req[1]);
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank,
            communicator_halo[dir], &req[0]);
  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
  return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
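The replacement routine above posts the receive, posts the send, and waits on both inside a single call, using one communicator per direction so several exchanges can run concurrently from different threads. A self-contained sketch of the same pattern, assuming MPI_COMM_WORLD in place of communicator_halo[dir] and an illustrative name halo_exchange:

    #include <mpi.h>

    // Sketch only: blocking pairwise halo exchange in the style of
    // StencilSendToRecvFrom. Grid tags each message with the sender's rank.
    double halo_exchange(void *xmit, int xmit_to_rank,
                         void *recv, int recv_from_rank,
                         int bytes, MPI_Comm comm, int myrank)
    {
      MPI_Request req[2];
      // Post the receive before the send so the matching send can complete eagerly.
      MPI_Irecv(recv, bytes, MPI_CHAR, recv_from_rank, recv_from_rank, comm, &req[1]);
      MPI_Isend(xmit, bytes, MPI_CHAR, xmit_to_rank,   myrank,         comm, &req[0]);
      MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
      return 2.0*bytes;   // bytes moved: one send plus one receive
    }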
@@ -391,37 +391,57 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  DhopFaceTime+=usecond();

  // Rely on async comms; start comms before merge of local data
  DhopComputeTime-=usecond();
  DhopCommTime-=usecond();
#pragma omp parallel
  double ctime=0;
  double ptime=0;
  //  DhopComputeTime-=usecond();
  //  DhopCommTime-=usecond();
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
  {
    // Should time this somehow; hard as the threads fork nowait
    st.CommunicateThreaded();
    int tid = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    int ncomms = CartesianCommunicator::nCommThreads;
    if (ncomms == -1) ncomms = st.Packets.size();
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      double start = usecond();
      nthreads -= ncomms;
      int ttid = tid - ncomms;
      int n = U._grid->oSites();
      int chunk = n / nthreads;
      int rem = n % nthreads;
      int myblock, myn;
      if (ttid < rem) {
        myblock = ttid * chunk + ttid;
        myn = chunk+1;
      } else {
        myblock = ttid*chunk + rem;
        myn = chunk;
      }

      if (dag == DaggerYes) {
#pragma omp for
        for (int ss = 0; ss < U._grid->oSites(); ss++) {
          int sU = ss;
          int sF = LLs * sU;
          Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
      // do the compute
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
          int sF = LLs * sU;
          Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
          int sF = LLs * sU;
          Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
        }
      }
      ptime = usecond() - start;
    }
  } else {
#pragma omp for
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
    {
      double start = usecond();
      st.CommunicateThreaded();
      ctime = usecond() - start;
    }
  }
#pragma omp single
  DhopComputeTime+=usecond();

#pragma omp taskwait

#pragma omp single
  DhopCommTime+=usecond();
  } // Closes parallel region and waits the comms (I hope)

  DhopCommTime += ctime;
  DhopComputeTime+=ptime;

  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
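The new structure dedicates the first ncomms OpenMP threads to MPI (they call st.CommunicateThreaded()) while the remaining threads each take an explicit block of the site loop, since a #pragma omp for would stall waiting for the comm threads. A minimal, self-contained sketch of the same split, with illustrative names (overlapped_dhop, nsites, the printf placeholder) standing in for Grid's stencil and Dhop kernels:

    #include <omp.h>
    #include <cassert>
    #include <cstdio>

    // Sketch: split the team into 'ncomms' communication threads and
    // (nthreads - ncomms) compute threads, each compute thread owning a
    // contiguous block of sites.
    void overlapped_dhop(int nsites, int ncomms)
    {
    #pragma omp parallel
      {
        int tid      = omp_get_thread_num();
        int nthreads = omp_get_num_threads();
        assert(nthreads > ncomms);

        if (tid < ncomms) {
          // communication lane: drive MPI for this thread's share of packets
          std::printf("thread %d doing comms\n", tid);
        } else {
          // compute lane: explicit block decomposition of the site loop
          int workers = nthreads - ncomms;
          int ttid    = tid - ncomms;
          int chunk   = nsites / workers;
          int rem     = nsites % workers;
          int myblock = (ttid < rem) ? ttid*(chunk+1) : ttid*chunk + rem;
          int myn     = (ttid < rem) ? chunk+1        : chunk;
          for (int ss = myblock; ss < myblock + myn; ++ss) {
            // apply the stencil kernel to site ss
          }
        }
      } // implicit barrier: comms and compute have both finished here
    }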
@@ -185,6 +185,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  double splicetime;
  double nosplicetime;
  double calls;
  std::vector<double> comms_bytesthr;
  std::vector<double> commtimethr;

  ////////////////////////////////////////
  // Stencil query
@@ -250,36 +252,22 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  //////////////////////////////////////////
  void CommunicateThreaded()
  {
    for(int i=0;i<Packets.size();i++){
#pragma omp task
      {
        double start;
        double stop;
        start = usecond();
        uint64_t bytes;
        std::vector<CommsRequest_t> reqs;
        bytes=_grid->StencilSendToRecvFromBegin(reqs,
                                                Packets[i].send_buf,
                                                Packets[i].to_rank,
                                                Packets[i].recv_buf,
                                                Packets[i].from_rank,
                                                Packets[i].bytes,i);
        _grid->StencilSendToRecvFromComplete(reqs,i);
        // Last task logged; this is approximate but hard to catch
        // the last to complete
        stop = usecond();
        stop = stop - start;

        if ( i==0 ) commtime+=stop;

#pragma omp critical
        {
          comms_bytes+=bytes;
        }

    // must be called in parallel region
    int mythread = omp_get_thread_num();
    int nthreads = CartesianCommunicator::nCommThreads;
    if (nthreads == -1) nthreads = Packets.size();
    if (mythread < nthreads) {
      for (int i = mythread; i < Packets.size(); i += nthreads) {
        double start = usecond();
        uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
                                                      Packets[i].to_rank,
                                                      Packets[i].recv_buf,
                                                      Packets[i].from_rank,
                                                      Packets[i].bytes,i);
        comms_bytesthr[mythread] += bytes;
        commtimethr[mythread]    += usecond() - start;
      }
    }

  }
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
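As the "// must be called in parallel region" comment says, the rewritten CommunicateThreaded assumes an enclosing OpenMP parallel region: each thread reads its own index and services every nthreads-th packet, and with nCommThreads left at -1 each packet gets a thread of its own. A minimal sketch of that calling convention and packet-to-thread mapping (service_packet and the packet count are illustrative placeholders):

    #include <omp.h>
    #include <cstdio>

    // Placeholder for the actual MPI exchange of one packet.
    void service_packet(int i) { std::printf("packet %d\n", i); }

    // -1 reproduces the default: one communication thread per packet.
    void communicate_threaded(int npackets, int ncommthreads)
    {
      int nthreads = (ncommthreads == -1) ? npackets : ncommthreads;
      int mythread = omp_get_thread_num();   // meaningful only inside a parallel region
      if (mythread < nthreads)
        for (int i = mythread; i < npackets; i += nthreads) service_packet(i);
    }

    int main()
    {
    #pragma omp parallel
      { communicate_threaded(8, 2); }        // threads 0 and 1 each drive four packets
      return 0;
    }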
@@ -475,7 +463,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
                   int checkerboard,
                   const std::vector<int> &directions,
                   const std::vector<int> &distances)
    :   _permute_type(npoints), _comm_buf_size(npoints)
    :   _permute_type(npoints),
        _comm_buf_size(npoints),
        comms_bytesthr(npoints),
        commtimethr(npoints)
  {
    face_table_computed=0;
    _npoints = npoints;
@@ -1029,6 +1020,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  void ZeroCounters(void) {
    gathertime = 0.;
    commtime = 0.;
    memset(&commtimethr[0], 0, sizeof(commtimethr));
    memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr));
    halogtime = 0.;
    mergetime = 0.;
    decompresstime = 0.;
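A caveat worth noting on the reset above: sizeof(commtimethr) is the size of the std::vector object itself (typically 24 bytes), not of its npoints doubles, so the memset may clear only the first few entries. A sketch of an element-wise reset, assuming these are the std::vector<double> members declared earlier:

    #include <vector>
    #include <algorithm>

    // Reset the per-thread counters element by element instead of memset'ing
    // sizeof(std::vector<double>) bytes, which is only the vector header size.
    void zero_thread_counters(std::vector<double> &commtimethr,
                              std::vector<double> &comms_bytesthr)
    {
      std::fill(commtimethr.begin(),    commtimethr.end(),    0.0);
      std::fill(comms_bytesthr.begin(), comms_bytesthr.end(), 0.0);
    }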
@@ -1044,6 +1037,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
    RealD NP = _grid->_Nprocessors;
    RealD NN = _grid->NodeCount();
    double t = 0;
    // if commtimethr is set they were all done in parallel so take the max
    // but add up the bytes
    for (int i = 0; i < 8; ++i) {
      comms_bytes += comms_bytesthr[i];
      if (t < commtimethr[i]) t = commtimethr[i];
    }
    commtime += t;

    _grid->GlobalSum(commtime); commtime/=NP;
    if ( calls > 0. ) {
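The report loop above sums the per-thread byte counts and takes the maximum of the per-thread times, since the transfers ran concurrently; the fixed bound of 8 presumably corresponds to the eight halo directions of a 4-d stencil. A sketch of the same aggregation written against the vectors' own length rather than a hard-coded count:

    #include <vector>
    #include <algorithm>
    #include <cstddef>

    // Bytes add up across threads; times take the max because the transfers
    // overlapped in time. Returns the value to add to the serial comm timer.
    double aggregate_thread_counters(const std::vector<double> &commtimethr,
                                     const std::vector<double> &comms_bytesthr,
                                     double &comms_bytes)
    {
      double t = 0.0;
      for (std::size_t i = 0; i < commtimethr.size(); ++i) {
        comms_bytes += comms_bytesthr[i];
        t = std::max(t, commtimethr[i]);
      }
      return t;
    }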
@@ -359,7 +359,11 @@ void Grid_init(int *argc,char ***argv)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }

  CartesianCommunicator::nCommThreads = -1;
  if( GridCmdOptionExists(*argv,*argv+*argc,"--commthreads") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--commthreads");
    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
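With this hook in place a run can request a fixed number of dedicated communication threads, for example by passing --commthreads 2; leaving the option unset keeps nCommThreads at -1, i.e. one communication thread per packet. A rough, illustrative stand-in for the GridCmdOption* helpers (not Grid's actual parsing code):

    #include <cstdlib>
    #include <cstring>

    // Scan argv for "--commthreads <n>"; fall back to -1 when absent.
    int parse_commthreads(int argc, char **argv)
    {
      for (int i = 1; i < argc-1; ++i) {
        if (std::strcmp(argv[i], "--commthreads") == 0) return std::atoi(argv[i+1]);
      }
      return -1;
    }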