mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Internal SHM comms in non-simd directions working

Need to fix simd directions
commit c190221fd3 (parent 0fcd2e7188)
Author: azusayamaguchi
Date:   2016-10-22 18:14:27 +01:00

16 changed files with 1729 additions and 1739 deletions

View File

@@ -80,7 +80,6 @@ class CartesianCommunicator {
void * ShmCommBuf;
std::vector<void *> ShmCommBufs;
std::vector<void *> ShmStencilBufs;
int WorldRank;
int WorldSize;
@@ -105,6 +104,10 @@ class CartesianCommunicator {
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
// Helper function for SHM Windows in MPI3
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
/////////////////////////////////
// Grid information queries
/////////////////////////////////
@@ -173,6 +176,16 @@ class CartesianCommunicator {
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
SendToRecvFromComplete(waitall);
}
////////////////////////////////////////////////////////////
// Barrier
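The header changes above add a small shared-memory (SHM) buffer API to CartesianCommunicator: ShmBufferSelf() returns this rank's segment of the node-local communications window, and ShmBuffer(rank) returns a pointer into a peer rank's segment, or NULL when that rank lives on another node (the non-MPI3 back-ends below always return NULL). A minimal caller-side sketch, assuming a communicator instance named comm; the helper itself is illustrative and not part of this commit:

    #include <cstring>

    // Hypothetical use of the new accessors: write into a neighbour's window
    // directly when it is on-node, otherwise fall back to the MPI path.
    void PutToNeighbour(CartesianCommunicator &comm,int dest,void *xmit,int bytes)
    {
      void *peer = comm.ShmBuffer(dest);   // NULL unless dest shares this node
      if ( peer != NULL ) {
        memcpy(peer,xmit,bytes);           // intra-node: plain store into shared memory
      } else {
        // inter-node (or non-MPI3 build): post MPI_Isend/MPI_Irecv as before
      }
    }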

View File

@@ -67,6 +67,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
assert(Size==_Nprocessors);
}
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
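The context line above shows the communicator's global-reduction idiom: an in-place MPI_Allreduce over the Cartesian communicator. For reference, the same idiom in a free-standing program (all names here are ours):

    #include <mpi.h>
    #include <cstdint>
    #include <cassert>

    int main(int argc,char **argv)
    {
      MPI_Init(&argc,&argv);
      uint32_t u = 1;   // every rank contributes one
      int ierr = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,MPI_COMM_WORLD);
      assert(ierr==0);  // u now equals the number of ranks
      MPI_Finalize();
      return 0;
    }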

View File

@@ -197,10 +197,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout<< "Ranks per node "<< ShmSize << std::endl;
std::cout<< "Nodes "<< GroupSize << std::endl;
std::cout<< "Ranks "<< WorldSize << std::endl;
std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl;
std::cout<<GridLogMessage<< "MPI-3 configuration: Ranks per node "<< ShmSize ;
std::cout<< " Nodes "<< GroupSize;
std::cout<< " Ranks "<< WorldSize;
std::cout<< " Shm CommBuf address"<< std::hex <<ShmCommBuf << std::dec<<std::endl;
// Done
ShmSetup=1;
@@ -208,12 +208,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
}
ShmCommBufs.resize(ShmSize);
ShmStencilBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
ShmStencilBufs[r] = (void *) ((uint64_t)ShmCommBufs[r]+MAX_MPI_SHM_BYTES/4);
}
////////////////////////////////////////////////////////////////
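The loop above fills ShmCommBufs[] by asking the shared window for every local rank's base address with MPI_Win_shared_query, then derives ShmStencilBufs[] at a fixed offset (MAX_MPI_SHM_BYTES/4) inside each segment. A free-standing sketch of the underlying MPI-3 idiom, with an illustrative segment size; the split, allocation, and query calls are the same ones used here:

    #include <mpi.h>
    #include <vector>

    int main(int argc,char **argv)
    {
      MPI_Init(&argc,&argv);
      MPI_Comm ShmComm;   // all ranks that can share memory, i.e. one node
      MPI_Comm_split_type(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&ShmComm);
      int ShmRank,ShmSize;
      MPI_Comm_rank(ShmComm,&ShmRank);
      MPI_Comm_size(ShmComm,&ShmSize);

      MPI_Aint bytes = 1<<20;   // illustrative; Grid uses MAX_MPI_SHM_BYTES
      void *ShmCommBuf;
      MPI_Win ShmWindow;
      MPI_Win_allocate_shared(bytes,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);

      std::vector<void *> ShmCommBufs(ShmSize);
      for(int r=0;r<ShmSize;r++){   // each rank learns every local peer's base pointer
        MPI_Aint sz; int dsp_unit;
        MPI_Win_shared_query(ShmWindow,r,&sz,&dsp_unit,&ShmCommBufs[r]);
      }

      MPI_Win_free(&ShmWindow);
      MPI_Finalize();
      return 0;
    }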
@@ -240,6 +238,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
ShmCoor.resize(_ndimension);
GroupCoor.resize(_ndimension);
WorldCoor.resize(_ndimension);
for(int l2=0;l2<log2size;l2++){
while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
ShmDims[dim]*=2;
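This hunk is truncated by the diff view; the loop it belongs to distributes the node-local rank count across the lattice dimensions, doubling one Shm dimension per factor of two and skipping dimensions that cannot be halved further. Reconstructed as a free-standing sketch (assuming ShmSize = 2^log2size and that the world dimensions carry enough factors of two to absorb them all):

    #include <vector>

    // Sketch: factor the node-local rank count across the processor grid.
    std::vector<int> SplitShmDims(const std::vector<int> &WorldDims,int log2size)
    {
      int ndimension = WorldDims.size();
      std::vector<int> ShmDims(ndimension,1);
      int dim=0;
      for(int l2=0;l2<log2size;l2++){
        // advance past dimensions that are already fully split
        while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%ndimension;
        ShmDims[dim]*=2;   // place this factor of two in dimension "dim"
      }
      return ShmDims;
    }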
@@ -347,6 +346,21 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
}
}
void *CartesianCommunicator::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
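ShmBuffer() leans on GroupRanks[], a table mapping world ranks to node-local ranks, with MPI_UNDEFINED marking off-node ranks. The table is built elsewhere in this file; a hedged sketch of how such a mapping can be produced with MPI_Group_translate_ranks:

    #include <mpi.h>
    #include <vector>

    // Sketch: world rank -> node-local rank; off-node ranks come back MPI_UNDEFINED.
    std::vector<int> MakeGroupRanks(MPI_Comm world,MPI_Comm ShmComm)
    {
      int WorldSize;
      MPI_Comm_size(world,&WorldSize);
      MPI_Group WorldGroup,ShmGroup;
      MPI_Comm_group(world,&WorldGroup);
      MPI_Comm_group(ShmComm,&ShmGroup);
      std::vector<int> WorldRanks(WorldSize),GroupRanks(WorldSize);
      for(int r=0;r<WorldSize;r++) WorldRanks[r]=r;
      MPI_Group_translate_ranks(WorldGroup,WorldSize,&WorldRanks[0],ShmGroup,&GroupRanks[0]);
      return GroupRanks;
    }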
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@@ -355,13 +369,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
int from,
int bytes)
{
#undef SHM_USE_BCOPY
MPI_Request xrq;
MPI_Request rrq;
static int sequence;
int rank = _processor;
int ierr;
int tag;
int check;
@@ -370,6 +382,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
sequence++;
@@ -379,30 +392,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
int small = (bytes<MAX_MPI_SHM_BYTES);
#ifndef SHM_USE_BCOPY
typedef vRealD T;
int words = bytes/sizeof(T);
assert(((size_t)bytes &(sizeof(T)-1))==0);
// assert(((size_t)xmit &(sizeof(T)-1))==0);
// assert(((size_t)recv &(sizeof(T)-1))==0);
#endif
assert(((size_t)bytes &(sizeof(T)-1))==0);
assert(gme == ShmRank);
// std::cerr << "proc dest from gme gdest "<<_processor<<" "<<dest <<" "<< from <<" "<<gme<<" "<< gdest<<std::endl; Barrier();
if ( small && (dest !=MPI_UNDEFINED) ) {
if ( small && (gdest !=MPI_UNDEFINED) ) {
assert(gme != gdest);
#ifdef SHM_USE_BCOPY
bcopy(xmit,to_ptr,bytes);
#else
T *ip = (T *)xmit;
T *op = (T *)to_ptr;
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
vstream(op[w],ip[w]);
}
#endif
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
bcopy(& sequence,&to_ptr[bytes+4],sizeof(sequence));
} else {
@@ -411,24 +417,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
list.push_back(xrq);
}
// std::cout << "Syncing "<<std::endl; Barrier();
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
// std::cout << "Receiving "<<std::endl; Barrier();
if (small && (from !=MPI_UNDEFINED) ) {
#ifdef SHM_USE_BCOPY
bcopy(from_ptr,recv,bytes);
#else
if (small && (gfrom !=MPI_UNDEFINED) ) {
T *ip = (T *)from_ptr;
T *op = (T *)recv;
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
vstream(op[w],ip[w]);
}
#endif
bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
assert(check==sequence);
@@ -438,28 +437,52 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
assert(ierr==0);
list.push_back(rrq);
}
// std::cout << "Syncing"<<std::endl; Barrier();
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
}
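Both sides of the exchange close with the same MPI_Win_sync / MPI_Barrier / MPI_Win_sync sandwich: the first sync publishes this rank's stores to the window, the barrier brings every node-local rank to the same point, and the second sync makes the peers' stores visible before anyone reads. Wrapped as a helper it might read as follows (illustrative; MPI_Win_sync requires an open passive-target epoch, e.g. via MPI_Win_lock_all, which this sketch assumes):

    // Hypothetical fence around the pattern used above.
    static inline void ShmFence(MPI_Win ShmWindow,MPI_Comm ShmComm)
    {
      MPI_Win_sync (ShmWindow);   // flush my stores to the shared window
      MPI_Barrier  (ShmComm);     // rendezvous of all node-local ranks
      MPI_Win_sync (ShmWindow);   // pick up the peers' stores
    }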
#if 0
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
#endif
int ierr;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
assert(gme == ShmRank);
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
}
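The new StencilSendToRecvFromBegin only posts MPI requests for off-node partners (gdest or gfrom equal to MPI_UNDEFINED); on-node traffic is meant to flow through the shared window, so the request list may come back empty. A hedged call-site sketch, assuming xmit/recv halo buffers and the rank variables are already set up:

    std::vector<CommsRequest_t> reqs;
    comm.StencilSendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
    // ... compute on interior sites while any off-node transfers proceed ...
    comm.StencilSendToRecvFromComplete(reqs);  // currently forwards to SendToRecvFromComplete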
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();

View File

@@ -33,6 +33,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
}
int Rank(void ){ return 0; };
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{

View File

@@ -50,6 +50,14 @@ typedef struct HandShake_t {
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
}
void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init();
XConnections.resize(shmem_n_pes());
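For completeness, the SHMEM back-end initialization shown above uses standard OpenSHMEM entry points; a minimal stand-alone equivalent looks like this (header name and finalize call vary by implementation vintage):

    #include <shmem.h>

    int main(void)
    {
      shmem_init();               // join the PE set, analogous to MPI_Init
      int npes = shmem_n_pes();   // number of processing elements
      int me   = shmem_my_pe();   // this PE's index
      (void)npes; (void)me;
      shmem_finalize();
      return 0;
    }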