mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Improvement: use our own shm_open call to avoid an OpenMPI bug.

paboyle 2016-11-02 12:37:46 +00:00
parent 32375aca65
commit 757a928f9a
2 changed files with 124 additions and 39 deletions
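
The change swaps MPI_Win_allocate_shared for directly managed POSIX shared memory (shm_open/ftruncate/mmap) and moves from unnamed to named semaphores (sem_open). The following is a minimal, self-contained sketch of that POSIX pattern, not code from the commit; the segment name, semaphore name, and size are illustrative placeholders.

    // Minimal sketch (illustrative, not from the commit): allocate a named
    // POSIX shared-memory segment and a named semaphore, the pattern used to
    // replace MPI_Win_allocate_shared and unnamed semaphores.
    #include <cassert>
    #include <fcntl.h>      // O_CREAT, O_EXCL, O_RDWR
    #include <semaphore.h>  // sem_open, sem_post, sem_wait, sem_unlink
    #include <sys/mman.h>   // shm_open, shm_unlink, mmap, munmap
    #include <unistd.h>     // ftruncate, close

    int main(void) {
      const char  *shm_name = "/Grid_example_shm";   // illustrative name
      const size_t shm_size = 1 << 20;               // 1 MiB, illustrative size

      // Creator: remove any stale segment, create a fresh one, size and map it.
      shm_unlink(shm_name);
      int fd = shm_open(shm_name, O_RDWR | O_CREAT, 0600);
      assert(fd >= 0);
      assert(ftruncate(fd, (off_t)shm_size) == 0);
      void *buf = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      assert(buf != MAP_FAILED);

      // Named semaphore: also works on macOS, where unnamed sem_init is unsupported.
      const char *sem_name = "/Grid_example_sem";    // illustrative name
      sem_unlink(sem_name);
      sem_t *sem = sem_open(sem_name, O_CREAT | O_EXCL, 0600, 0);
      assert(sem != SEM_FAILED);

      // A peer process would open the same names (without O_EXCL) and
      // coordinate via sem_post/sem_wait on data placed in the mapped segment.
      assert(sem_post(sem) == 0);
      assert(sem_wait(sem) == 0);

      // Tear down this single-process illustration.
      munmap(buf, shm_size);
      close(fd);
      sem_close(sem);
      sem_unlink(sem_name);
      shm_unlink(shm_name);
      return 0;
    }

A second process attaching to the same names would call shm_open/mmap and sem_open without O_EXCL; on some Linux toolchains, linking with -lrt is needed for the shm_* and sem_* calls.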

View File

@@ -48,7 +48,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
int maxlat=32;
int maxlat=16;
for(int lat=4;lat<=maxlat;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
@@ -193,7 +193,7 @@ int main (int argc, char ** argv)
}
}
Nloop=100;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;

View File

@@ -28,23 +28,26 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "Grid.h"
#include <mpi.h>
/// bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
/// darwin dispatch semaphores don't seem to be multiprocess.
//#ifdef __APPLE__
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Workarounds:
/// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
/// darwin dispatch semaphores don't seem to be multiprocess.
///
/// ii) openmpi under --mca shmem posix works with two squadrons per node;
/// openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
/// memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <semaphore.h>
typedef sem_t *Grid_semaphore;
#define SEM_INIT(S) S = sem_open(sem_name,0,00777,0);
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,00777,0);
#define SEM_POST(S) sem_post(S);
#define SEM_WAIT(S) sem_wait(S);
//#else
//#include <semaphore.h>
//typedef sem_t Grid_semaphore;
//#define SEM_INIT(S)
//#define SEM_INIT_EXCL(S) sem_init(&S,0);
//#define SEM_POST(S) sem_post(&S);
//#define SEM_WAIT(S) sem_wait(&S);
//#endif
#define SEM_INIT(S) S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
#define SEM_POST(S) assert ( sem_post(S) == 0 );
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
#include <sys/mman.h>
namespace Grid {
@@ -63,8 +66,6 @@ const int pool = 48;
class SlaveState {
public:
Grid_semaphore sem_head;
Grid_semaphore sem_tail;
volatile int head;
volatile int start;
volatile int tail;
@@ -73,6 +74,8 @@ public:
class Slave {
public:
Grid_semaphore sem_head;
Grid_semaphore sem_tail;
SlaveState *state;
MPI_Comm squadron;
uint64_t base;
@@ -87,33 +90,38 @@ public:
void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
void SemInit(void) {
sprintf(sem_name,"/Grid_mpi3_head_%d",universe_rank);
SEM_INIT(state->sem_head);
sprintf(sem_name,"/Grid_mpi3_tail_%d",universe_rank);
SEM_INIT(state->sem_tail);
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
printf("SEM_NAME: %s \n",sem_name);
SEM_INIT(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
printf("SEM_NAME: %s \n",sem_name);
SEM_INIT(sem_tail);
}
void SemInitExcl(void) {
sprintf(sem_name,"/Grid_mpi3_head_%d",universe_rank);
SEM_INIT_EXCL(state->sem_head);
sprintf(sem_name,"/Grid_mpi3_tail_%d",universe_rank);
SEM_INIT_EXCL(state->sem_tail);
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
printf("SEM_INIT_EXCL: %s \n",sem_name);
SEM_INIT_EXCL(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
printf("SEM_INIT_EXCL: %s \n",sem_name);
SEM_INIT_EXCL(sem_tail);
}
void WakeUpDMA(void) {
SEM_POST(state->sem_head);
SEM_POST(sem_head);
};
void WakeUpCompute(void) {
SEM_POST(state->sem_tail);
SEM_POST(sem_tail);
};
void WaitForCommand(void) {
SEM_WAIT(state->sem_head);
SEM_WAIT(sem_head);
};
void WaitForComplete(void) {
SEM_WAIT(state->sem_tail);
SEM_WAIT(sem_tail);
};
void EventLoop (void) {
std::cout<< " Entering event loop "<<std::endl;
while(1){
WaitForCommand();
// std::cout << "Getting command "<<std::endl;
Event();
}
}
@@ -123,9 +131,13 @@ public:
uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
void WaitAll() {
// std::cout << "Queueing WAIT command "<<std::endl;
QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
// std::cout << "Waking up DMA "<<std::endl;
WakeUpDMA();
// std::cout << "Waiting from semaphore "<<std::endl;
WaitForComplete();
// std::cout << "Checking FIFO is empty "<<std::endl;
assert ( state->tail == state->head );
}
};
@@ -176,19 +188,25 @@ public:
static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing send "<< bytes<< " slave "<< slave << " to comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
// std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
Slaves[slave].WakeUpDMA();
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
// std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
Slaves[slave].WakeUpDMA();
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void WaitAll() {
for(int s=1;s<VerticalSize;s++) {
// std::cout << "Waiting for slave "<< s<<std::endl;
Slaves[s].WaitAll();
}
// std::cout << " Wait all Complete "<<std::endl;
};
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
@@ -247,10 +265,9 @@ int MPIoffloadEngine::HorizontalRank;
int MPIoffloadEngine::HorizontalSize;
MPI_Comm MPIoffloadEngine::VerticalComm;
MPI_Win MPIoffloadEngine::VerticalWindow;
int MPIoffloadEngine::VerticalSize;
int MPIoffloadEngine::VerticalRank;
MPI_Win MPIoffloadEngine::VerticalWindow;
std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
@@ -274,8 +291,12 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory (Verticals)
/////////////////////////////////////////////////////////////////////
// MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
MPI_Comm_split(communicator_universe,(UniverseRank/2),UniverseRank,&VerticalComm);
#define MPI_SHARED_MEM_DEBUG
#ifdef MPI_SHARED_MEM_DEBUG
MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
#else
MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
#endif
MPI_Comm_rank(VerticalComm ,&VerticalRank);
MPI_Comm_size(VerticalComm ,&VerticalSize);
@@ -308,19 +329,83 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group, pass back Shm info to CartesianCommunicator
//////////////////////////////////////////////////////////////////////////////////////////////////////////
VerticalShmBufs.resize(VerticalSize);
#undef MPI_SHARED_MEM
#ifdef MPI_SHARED_MEM
ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
assert(ierr==0);
std::cout<<"SHM "<<ShmCommBuf<<std::endl;
// std::cout<<"SHM "<<ShmCommBuf<<std::endl;
VerticalShmBufs.resize(VerticalSize);
for(int r=0;r<VerticalSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
#else
char shm_name [NAME_MAX];
MPI_Barrier(VerticalComm);
if ( VerticalRank == 0 ) {
for(int r=0;r<VerticalSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
if ( r>0 ) size = sizeof(SlaveState);
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
if ( fd < 0 ) {
perror("failed shm_open");
assert(0);
}
ftruncate(fd, size);
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( VerticalShmBufs[r] == MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
check[0] = WorldRank;
check[1] = r;
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
}
MPI_Barrier(VerticalComm);
if ( VerticalRank != 0 ) {
for(int r=0;r<VerticalSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
if ( r>0 ) size = sizeof(SlaveState);
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
if ( fd<0 ) {
perror("failed shm_open");
assert(0);
}
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
assert(check[0]== WorldRank);
assert(check[1]== r);
std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
}
#endif
MPI_Barrier(VerticalComm);
//////////////////////////////////////////////////////////////////////
// Map rank of leader on node in their in new world, to the