Update to use shared memory to contain the stencil comms buffers

Tested on 2.1.1.1, 1.2.1.1, 4.1.1.1, 1.4.1.1 and 2.2.1.1 subnode decompositions.
2026-05-21 01:24:16 +01:00 · 2016-10-24 17:30:43 +01:00
parent ea25a4d9ac
commit b6a65059a2
13 changed files with 706 additions and 458 deletions
@@ -40,169 +40,188 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
+
 namespace Grid {
+
 class CartesianCommunicator {
  public:    

+  // 65536 ranks per node adequate for now
+  // 128MB of shared memory for comms is enough for 48^4 local volume comms
+  // Give external control (command line override?) of this
+
+  static const int      MAXLOG2RANKSPERNODE = 16;            
+  static const uint64_t MAX_MPI_SHM_BYTES   = 128*1024*1024; 
+
  // Communicator should know nothing of the physics grid, only processor grid.
-  
-    int              _Nprocessors;     // How many in all
-    std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
-    int              _processor;       // linear processor rank
-    std::vector<int> _processor_coor;  // linear processor coordinate
-    unsigned long _ndimension;
+  int              _Nprocessors;     // How many in all
+  std::vector<int> _processors;      // Which dimensions get laid out over processor lanes.
+  int              _processor;       // linear processor rank
+  std::vector<int> _processor_coor;  // processor coordinate (one entry per dimension)
+  unsigned long _ndimension;

-#ifdef GRID_COMMS_MPI
-    MPI_Comm communicator;
-    typedef MPI_Request CommsRequest_t;
-#elif  GRID_COMMS_MPI3
-    int shm_mode;
-
-    MPI_Comm communicator;
-    typedef MPI_Request CommsRequest_t;
-
-    const int MAXLOG2RANKSPERNODE = 16;     // 65536 ranks per node adequate for now
-    const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enought for 48^4 local vol comms
-
-    std::vector<int>  WorldDims;
-    std::vector<int>  GroupDims;
-    std::vector<int>  ShmDims;
-
-    std::vector<int> GroupCoor;
-    std::vector<int> ShmCoor;
-    std::vector<int> WorldCoor;
-
-    static std::vector<int> GroupRanks; 
-    static std::vector<int> MyGroup;
-    static int ShmSetup;
-    static MPI_Win ShmWindow; 
-    static MPI_Comm ShmComm;
-
-    void * ShmCommBuf;
-    std::vector<void *> ShmCommBufs;
-
-    int WorldRank;
-    int WorldSize;
-
-    static int ShmRank;
-    static int ShmSize;
-    static int GroupSize;
-    static int GroupRank;
-
-    std::vector<int>  LexicographicToWorldRank;
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+  MPI_Comm communicator;
+  static MPI_Comm communicator_world;
+  typedef MPI_Request CommsRequest_t;
 #else 
-    typedef int CommsRequest_t;
+  typedef int CommsRequest_t;
 #endif

-    static void Init(int *argc, char ***argv);
+  ////////////////////////////////////////////////////////////////////
+  // Helper functionality for SHM Windows common to all other impls
+  ////////////////////////////////////////////////////////////////////
+  // Longer term: drop this in favour of a master / slave model with
+  // cartesian communicator on a subset of ranks, slave ranks controlled
+  // by group leader with data xfer via shared memory
+  ////////////////////////////////////////////////////////////////////
+#ifdef  GRID_COMMS_MPI3
+  std::vector<int>  WorldDims;
+  std::vector<int>  GroupDims;
+  std::vector<int>  ShmDims;
+  
+  std::vector<int> GroupCoor;
+  std::vector<int> ShmCoor;
+  std::vector<int> WorldCoor;
+  
+  static std::vector<int> GroupRanks; 
+  static std::vector<int> MyGroup;
+  static int ShmSetup;
+  static MPI_Win ShmWindow; 
+  static MPI_Comm ShmComm;
+  
+  std::vector<int>  LexicographicToWorldRank;
+  
+  static std::vector<void *> ShmCommBufs;
+#else 
+  static void ShmInitGeneric(void);
+  static commVector<uint8_t> ShmBufStorageVector;
+#endif 
+  static void * ShmCommBuf;
+  size_t heap_top;
+  size_t heap_bytes;
+  void *ShmBufferSelf(void);
+  void *ShmBuffer(int rank);
+  void *ShmBufferTranslate(int rank,void * local_p);
+  void *ShmBufferMalloc(size_t bytes);
+  void *ShmBufferFreeAll(void) ;
+  
+  ////////////////////////////////////////////////
+  // Must call in Grid startup
+  ////////////////////////////////////////////////
+  static void Init(int *argc, char ***argv);
+  
+  ////////////////////////////////////////////////
+  // Constructor of any given grid
+  ////////////////////////////////////////////////
+  CartesianCommunicator(const std::vector<int> &pdimensions_in);
+  
+  ////////////////////////////////////////////////////////////////////////////////////////
+  // Wraps MPI_Cart routines, or implements equivalent on other impls
+  ////////////////////////////////////////////////////////////////////////////////////////
+  void ShiftedRanks(int dim,int shift,int & source, int & dest);
+  int  RankFromProcessorCoor(std::vector<int> &coor);
+  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
+  
+  /////////////////////////////////
+  // Grid information and queries
+  /////////////////////////////////
+  static int ShmRank;
+  static int ShmSize;
+  static int GroupSize;
+  static int GroupRank;
+  static int WorldRank;
+  static int WorldSize;
+  static int Slave;
+  
+  int                      IsBoss(void)            ;
+  int                      BossRank(void)          ;
+  int                      ThisRank(void)          ;
+  const std::vector<int> & ThisProcessorCoor(void) ;
+  const std::vector<int> & ProcessorGrid(void)     ;
+  int                      ProcessorCount(void)    ;
+  static int Ranks    (void);
+  static int Nodes    (void);
+  static int Cores    (void);
+  static int NodeRank (void);
+  static int CoreRank (void);

-    // Constructor
-    CartesianCommunicator(const std::vector<int> &pdimensions_in);
+  ////////////////////////////////////////////////////////////////////////////////
+  // very VERY rarely (Log, serial RNG) we need world without a grid
+  ////////////////////////////////////////////////////////////////////////////////
+  static int  RankWorld(void) ;
+  static void BroadcastWorld(int root,void* data, int bytes);
+  
+  ////////////////////////////////////////////////////////////
+  // Reduction
+  ////////////////////////////////////////////////////////////
+  void GlobalSum(RealF &);
+  void GlobalSumVector(RealF *,int N);
+  void GlobalSum(RealD &);
+  void GlobalSumVector(RealD *,int N);
+  void GlobalSum(uint32_t &);
+  void GlobalSum(uint64_t &);
+  void GlobalSum(ComplexF &c);
+  void GlobalSumVector(ComplexF *c,int N);
+  void GlobalSum(ComplexD &c);
+  void GlobalSumVector(ComplexD *c,int N);
+  
+  template<class obj> void GlobalSum(obj &o){
+    typedef typename obj::scalar_type scalar_type;
+    int words = sizeof(obj)/sizeof(scalar_type);
+    scalar_type * ptr = (scalar_type *)& o;
+    GlobalSumVector(ptr,words);
+  }
+  
+  ////////////////////////////////////////////////////////////
+  // Face exchange, buffer swap in a translationally invariant way
+  ////////////////////////////////////////////////////////////
+  void SendToRecvFrom(void *xmit,
+		      int xmit_to_rank,
+		      void *recv,
+		      int recv_from_rank,
+		      int bytes);
+  
+  void SendRecvPacket(void *xmit,
+		      void *recv,
+		      int xmit_to_rank,
+		      int recv_from_rank,
+		      int bytes);
+  
+  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+			   void *xmit,
+			   int xmit_to_rank,
+			   void *recv,
+			   int recv_from_rank,
+			   int bytes);
+  
+  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

-    // Wraps MPI_Cart routines
-    void ShiftedRanks(int dim,int shift,int & source, int & dest);
-    int  RankFromProcessorCoor(std::vector<int> &coor);
-    void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
+  void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+				  void *xmit,
+				  int xmit_to_rank,
+				  void *recv,
+				  int recv_from_rank,
+				  int bytes);
+  
+  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+  void StencilBarrier(void);

-    // Helper function for SHM Windows in MPI3
-    void *ShmBufferSelf(void);
-    void *ShmBuffer(int rank);
-
-    /////////////////////////////////
-    // Grid information queries
-    /////////////////////////////////
-    int                      IsBoss(void)            { return _processor==0; };
-    int                      BossRank(void)          { return 0; };
-    int                      ThisRank(void)          { return _processor; };
-    const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
-    const std::vector<int> & ProcessorGrid(void)     { return _processors; };
-    int                      ProcessorCount(void)    { return _Nprocessors; };
-
-    ////////////////////////////////////////////////////////////
-    // Reduction
-    ////////////////////////////////////////////////////////////
-    void GlobalSum(RealF &);
-    void GlobalSumVector(RealF *,int N);
-
-    void GlobalSum(RealD &);
-    void GlobalSumVector(RealD *,int N);
-
-    void GlobalSum(uint32_t &);
-    void GlobalSum(uint64_t &);
-
-    void GlobalSum(ComplexF &c)
-    {
-      GlobalSumVector((float *)&c,2);
-    }
-    void GlobalSumVector(ComplexF *c,int N)
-    {
-      GlobalSumVector((float *)c,2*N);
-    }
-
-    void GlobalSum(ComplexD &c)
-    {
-      GlobalSumVector((double *)&c,2);
-    }
-    void GlobalSumVector(ComplexD *c,int N)
-    {
-      GlobalSumVector((double *)c,2*N);
-    }
-    
-    template<class obj> void GlobalSum(obj &o){
-      typedef typename obj::scalar_type scalar_type;
-      int words = sizeof(obj)/sizeof(scalar_type);
-      scalar_type * ptr = (scalar_type *)& o;
-      GlobalSumVector(ptr,words);
-    }
-    ////////////////////////////////////////////////////////////
-    // Face exchange, buffer swap in translational invariant way
-    ////////////////////////////////////////////////////////////
-    void SendToRecvFrom(void *xmit,
-			int xmit_to_rank,
-			void *recv,
-			int recv_from_rank,
-			int bytes);
-
-    void SendRecvPacket(void *xmit,
-			void *recv,
-			int xmit_to_rank,
-			int recv_from_rank,
-			int bytes);
-
-    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-			 void *xmit,
-			 int xmit_to_rank,
-			 void *recv,
-			 int recv_from_rank,
-			 int bytes);
-    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
-    void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-			 void *xmit,
-			 int xmit_to_rank,
-			 void *recv,
-			 int recv_from_rank,
-			 int bytes);
-    void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
-    {
-      SendToRecvFromComplete(waitall);
-    }
-
-    ////////////////////////////////////////////////////////////
-    // Barrier
-    ////////////////////////////////////////////////////////////
-    void Barrier(void);
-
-    ////////////////////////////////////////////////////////////
-    // Broadcast a buffer and composite larger
-    ////////////////////////////////////////////////////////////
-    void Broadcast(int root,void* data, int bytes);
-    template<class obj> void Broadcast(int root,obj &data)
+  ////////////////////////////////////////////////////////////
+  // Barrier
+  ////////////////////////////////////////////////////////////
+  void Barrier(void);
+  
+  ////////////////////////////////////////////////////////////
+  // Broadcast a raw buffer, and larger composite objects
+  ////////////////////////////////////////////////////////////
+  void Broadcast(int root,void* data, int bytes);
+  
+  template<class obj> void Broadcast(int root,obj &data)
    {
      Broadcast(root,(void *)&data,sizeof(data));
    };

-    static void BroadcastWorld(int root,void* data, int bytes);
-
 }; 
 }