
Update to use shared memory to contain the stencil comms buffers

Tested on 2.1.1.1, 1.2.1.1, 4.1.1.1, 1.4.1.1, and 2.2.1.1 subnode decompositions
commit b6a65059a2 (parent ea25a4d9ac)
Author: azusayamaguchi
Date:   2016-10-24 17:30:43 +01:00

13 changed files with 706 additions and 458 deletions
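For orientation before the per-file diffs: the mechanism this commit adopts is an MPI-3 shared-memory window. Ranks on a node split off their own communicator, allocate one window between them, and query each other's base pointers so stencil halos can move through plain loads and stores. A minimal standalone sketch of that pattern (names and sizes here are illustrative, not Grid's):

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);

      // Split world into groups of ranks that can share memory (one per node).
      MPI_Comm shm_comm;
      MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                          MPI_INFO_NULL, &shm_comm);
      int shm_rank; MPI_Comm_rank(shm_comm, &shm_rank);

      // Each rank contributes a segment to one node-local shared window.
      void *base; MPI_Win win;
      MPI_Win_allocate_shared(1 << 20, 1, MPI_INFO_NULL, shm_comm, &base, &win);

      // Any rank can obtain a direct pointer into a peer's segment.
      MPI_Aint sz; int disp_unit; void *peer0;
      MPI_Win_shared_query(win, 0, &sz, &disp_unit, &peer0);
      printf("shm rank %d sees rank 0's segment at %p\n", shm_rank, peer0);

      MPI_Win_free(&win);
      MPI_Finalize();
      return 0;
    }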


@@ -0,0 +1,132 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_none.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
namespace Grid {
///////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
void * CartesianCommunicator::ShmCommBuf;
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
assert(heap_bytes < MAX_MPI_SHM_BYTES);
return ptr;
}
void CartesianCommunicator::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}
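ShmBufferMalloc and ShmBufferFreeAll implement a bump allocator over the fixed shared region: an allocation only advances heap_top, and the whole arena is reset in one call once an exchange completes. A hedged sketch of the intended call pattern (the halo_exchange wrapper and its arguments are hypothetical):

    // Hypothetical call site, assuming a CartesianCommunicator &grid is in scope.
    void halo_exchange(CartesianCommunicator &grid, int to, int from, int bytes) {
      void *send_buf = grid.ShmBufferMalloc(bytes);  // bumps heap_top by bytes
      void *recv_buf = grid.ShmBufferMalloc(bytes);  // second carve-out; no headers kept
      // ... pack send_buf ...
      grid.SendToRecvFrom(send_buf, to, recv_buf, from, bytes);
      // ... unpack recv_buf ...
      grid.ShmBufferFreeAll();                       // O(1) reset; no per-buffer free
    }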
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; };
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void) { return WorldRank; };
int CartesianCommunicator::Ranks (void) { return WorldSize; };
int CartesianCommunicator::Nodes (void) { return GroupSize; };
int CartesianCommunicator::Cores (void) { return ShmSize; };
int CartesianCommunicator::NodeRank (void) { return GroupRank; };
int CartesianCommunicator::CoreRank (void) { return ShmRank; };
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
#ifndef GRID_COMMS_MPI3
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes)
{
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
void *CartesianCommunicator::ShmBuffer(int rank) {
if (rank != ShmRank ) return NULL;
else return ShmCommBuf;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
if (rank != ShmRank ) return NULL;
else return local_p;
}
void CartesianCommunicator::ShmInitGeneric(void){
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0];
}
#endif
}


@@ -40,169 +40,188 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif
namespace Grid {
class CartesianCommunicator {
public:
// 65536 ranks per node adequate for now
// 128MB shared memory for comms enough for 48^4 local vol comms
// Give external control (command line override?) of this
static const int MAXLOG2RANKSPERNODE = 16;
static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024;
// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processor lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processor lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
#ifdef GRID_COMMS_MPI
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
#elif GRID_COMMS_MPI3
int shm_mode;
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now
const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enough for 48^4 local vol comms
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;
void * ShmCommBuf;
std::vector<void *> ShmCommBufs;
int WorldRank;
int WorldSize;
static int ShmRank;
static int ShmSize;
static int GroupSize;
static int GroupRank;
std::vector<int> LexicographicToWorldRank;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
MPI_Comm communicator;
static MPI_Comm communicator_world;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
typedef int CommsRequest_t;
#endif
static void Init(int *argc, char ***argv);
////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
// Longer term; drop this in favour of a master / slave model with
// cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory
////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;
std::vector<int> LexicographicToWorldRank;
static std::vector<void *> ShmCommBufs;
#else
static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector;
#endif
static void * ShmCommBuf;
size_t heap_top;
size_t heap_bytes;
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void);
////////////////////////////////////////////////
// Must call in Grid startup
////////////////////////////////////////////////
static void Init(int *argc, char ***argv);
////////////////////////////////////////////////
// Constructor of any given grid
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &pdimensions_in);
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
/////////////////////////////////
// Grid information and queries
/////////////////////////////////
static int ShmRank;
static int ShmSize;
static int GroupSize;
static int GroupRank;
static int WorldRank;
static int WorldSize;
static int Slave;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
static int Ranks (void);
static int Nodes (void);
static int Cores (void);
static int NodeRank (void);
static int CoreRank (void);
// Constructor
CartesianCommunicator(const std::vector<int> &pdimensions_in);
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c);
void GlobalSumVector(ComplexD *c,int N);
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words);
}
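The template variant reduces any object that exposes a scalar_type by reinterpreting it as a flat scalar array, so one MPI reduction covers a whole tensor. A hedged illustration with a stand-in type (ToyObj is not a Grid type):

    struct ToyObj {                          // stand-in for a Grid tensor type
      typedef Grid::ComplexD scalar_type;
      Grid::ComplexD elem[4];                // sizeof(ToyObj)/sizeof(scalar_type) == 4
    };

    ToyObj o;
    grid.GlobalSum(o);   // expands to GlobalSumVector((ComplexD *)&o, 4),
                         // which forwards to GlobalSumVector((double *)..., 8)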
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
// Wraps MPI_Cart routines
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilBarrier(void);
// Helper function for SHM Windows in MPI3
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int IsBoss(void) { return _processor==0; };
int BossRank(void) { return 0; };
int ThisRank(void) { return _processor; };
const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & ProcessorGrid(void) { return _processors; };
int ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
SendToRecvFromComplete(waitall);
}
////////////////////////////////////////////////////////////
// Barrier
////////////////////////////////////////////////////////////
void Barrier(void);
////////////////////////////////////////////////////////////
// Broadcast a buffer and composite larger
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
template<class obj> void Broadcast(int root,obj &data)
////////////////////////////////////////////////////////////
// Barrier
////////////////////////////////////////////////////////////
void Barrier(void);
////////////////////////////////////////////////////////////
// Broadcast a buffer and composite larger
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};
static void BroadcastWorld(int root,void* data, int bytes);
};
}
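Putting the interface together, a typical client of this class might look as follows (a sketch only; the 2.2.1.1 layout echoes the decompositions named in the commit message, and a real Grid program would normally go through the framework's own startup):

    #include <iostream>
    #include "Grid.h"
    using namespace Grid;

    int main(int argc, char **argv) {
      CartesianCommunicator::Init(&argc, &argv);  // must precede any communicator
      std::vector<int> mpi_layout({2,2,1,1});     // 4 ranks, split over x and y
      CartesianCommunicator comm(mpi_layout);

      RealD x = comm.ThisRank();
      comm.GlobalSum(x);                          // sum of 0+1+2+3 across 4 ranks
      if (comm.IsBoss()) std::cout << "sum of ranks = " << x << std::endl;
      return 0;
    }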


@@ -30,19 +30,28 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
// Should error check all MPI calls.
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}
}
int Rank(void) {
int pe;
MPI_Comm_rank(MPI_COMM_WORLD,&pe);
return pe;
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
@@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
_processors = processors;
_processor_coor.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@@ -67,15 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
assert(Size==_Nprocessors);
}
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@@ -194,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
communicator);
assert(ierr==0);
}
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
MPI_COMM_WORLD);
communicator_world);
assert(ierr==0);
}


@@ -1,4 +1,3 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -33,26 +32,197 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;
// Global used by Init and nowhere else. How to hide?
int Rank(void) {
int pe;
MPI_Comm_rank(MPI_COMM_WORLD,&pe);
return pe;
MPI_Comm CartesianCommunicator::communicator_world;
MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow;
std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
std::vector<void *> CartesianCommunicator::ShmCommBufs;
void *CartesianCommunicator::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
}
// Should error check all MPI calls.
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}
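Because every rank's segment is carved up identically by ShmBufferMalloc, an offset into my segment is meaningful in a peer's. A worked example with illustrative addresses:

    // ShmCommBufs[ShmRank] = 0x7f0000000000   my segment base
    // ShmCommBufs[gpeer]   = 0x7f0000800000   peer's segment base
    // local_p              = 0x7f0000001000   a buffer I carved out at offset 0x1000
    // offset = local_p - ShmCommBufs[ShmRank]  = 0x1000
    // remote = ShmCommBufs[gpeer] + offset     = 0x7f0000801000
    // i.e. the peer's copy of the same ShmBufferMalloc carve-out.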
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
//
////////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
GroupSize = WorldSize/ShmSize;
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator_world, &WorldGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
MyGroup.resize(ShmSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
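A worked example of the leader election, assuming WorldSize=4 with 2 ranks per node (so ShmSize=2, GroupSize=2):

    // On node 0 (world ranks {0,1}): GroupRanks maps 0,1 to shm ranks and
    //   2,3 to MPI_UNDEFINED, so MyGroup = {0,1} and myleader = 0 (sorted ascending).
    // On node 1 (world ranks {2,3}): MyGroup = {2,3} and myleader = 2.
    // Each rank flags its own leader, so after the Allreduce
    //   leaders_1hot = {2, 0, 2, 0}   (two votes per leader; nonzero is what matters)
    // giving leaders_group = {0, 2}; ranks 2 and 3 then find GroupRank = 1.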
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = 0;
ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
assert(ierr==0);
// KNL hack -- force to numa-domain 1 in flat
#if 0
for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
void *pages = (void *) ( page + ShmCommBuf );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
//////////////////////////////////////////////////////////////////////////////////////////////////////////
if (WorldRank == 0){
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
std::cout<< WorldSize << " Ranks " ;
std::cout<< GroupSize << " Nodes " ;
std::cout<< ShmSize << " with ranks-per-node "<<std::endl;
std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
for(int g=0;g<GroupSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
}
std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
for(int g=0;g<ShmSize;g++){
std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
if(g!=ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
}
for(int g=0;g<GroupSize;g++){
if ( (ShmRank == 0) && (GroupRank==g) ) std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
for(int r=0;r<ShmSize;r++){
if ( (ShmRank == 0) && (GroupRank==g) ) {
std::cout<<MyGroup[r];
if(r<ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
MPI_Barrier(communicator_world);
}
}
assert(ShmSetup==0); ShmSetup=1;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor;
@@ -80,139 +250,13 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
rank = LexicographicToWorldRank[rank];
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow;
std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
WorldDims = processors;
communicator = MPI_COMM_WORLD;
MPI_Comm_rank(communicator,&WorldRank);
MPI_Comm_size(communicator,&WorldSize);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Does every grid need one, or could we share across all grids via a singleton/guard?
int ierr;
if ( !ShmSetup ) {
communicator=communicator_world;
MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
GroupSize = WorldSize/ShmSize;
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator, &WorldGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
MyGroup.resize(ShmSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::greater<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = 0;
ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
assert(ierr==0);
for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
void *pages = (void *) ( page + ShmCommBuf );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout<<GridLogMessage<< "MPI-3 configuration: Ranks per node "<< ShmSize ;
std::cout<< " Nodes "<< GroupSize;
std::cout<< " Ranks "<< WorldSize;
std::cout<< " Shm CommBuf address"<< std::hex <<ShmCommBuf << std::dec<<std::endl;
// Done
ShmSetup=1;
}
ShmCommBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
}
_ndimension = processors.size();
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
@@ -232,6 +276,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
////////////////////////////////////////////////////////////////
int dim = 0;
std::vector<int> WorldDims = processors;
ShmDims.resize(_ndimension,1);
GroupDims.resize(_ndimension);
@@ -346,21 +392,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
}
}
void *CartesianCommunicator::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@@ -369,6 +400,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from,
int bytes)
{
#if 1
MPI_Request xrq;
MPI_Request rrq;
@@ -387,12 +419,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
sequence++;
char *to_ptr = (char *)ShmCommBufs[gdest];
char *from_ptr = (char *)ShmCommBufs[ShmRank];
int small = (bytes<MAX_MPI_SHM_BYTES);
typedef vRealD T;
typedef uint64_t T;
int words = bytes/sizeof(T);
assert(((size_t)bytes &(sizeof(T)-1))==0);
@@ -400,13 +431,18 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
if ( small && (gdest !=MPI_UNDEFINED) ) {
char *to_ptr = (char *)ShmCommBufs[gdest];
assert(gme != gdest);
T *ip = (T *)xmit;
T *op = (T *)to_ptr;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
vstream(op[w],ip[w]);
op[w]=ip[w];
if ( w == 0 ) {
// std::cout << " xmit "<< ShmRank <<" -> "<< gdest<<" " <<std::hex<<op[w]<<std::dec<<std::endl;
}
}
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
@@ -426,7 +462,10 @@ PARALLEL_FOR_LOOP
T *op = (T *)recv;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
vstream(op[w],ip[w]);
op[w]=ip[w];
if ( w == 0 ) {
// std::cout << " recv "<< ShmRank <<" <- "<< gfrom<<" " <<std::hex<<op[w]<<std::dec<<std::endl;
}
}
bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
@@ -441,6 +480,19 @@ PARALLEL_FOR_LOOP
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
#else
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
#endif
}
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -476,19 +528,29 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
list.push_back(rrq);
}
StencilBarrier();
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
SendToRecvFromComplete(list);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
}
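StencilBarrier is the publication point for data written straight into a peer's window segment: the first MPI_Win_sync flushes this rank's stores into the window's public copy, the node-local barrier orders all ranks, and the second MPI_Win_sync makes peers' stores visible locally. A hedged sketch of the producer/consumer pattern it enables (buf, peer, halo_data, and bytes are illustrative):

    // Producer: write the halo directly into the peer's segment, then publish.
    void *remote = grid.ShmBufferTranslate(peer, buf);  // peer's copy of my buf
    memcpy(remote, halo_data, bytes);                   // plain store into shm
    grid.StencilBarrier();                              // sync; barrier; sync
    // Consumer (rank `peer`): after the same StencilBarrier, reading its own
    // `buf` carve-out now observes halo_data.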
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
@@ -514,7 +576,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
bytes,
MPI_BYTE,
root,
MPI_COMM_WORLD);
communicator_world);
assert(ierr==0);
}


@@ -28,18 +28,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "Grid.h"
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
void * CartesianCommunicator::ShmCommBuf;
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
void CartesianCommunicator::Init(int *argc, char *** argv)
{
}
int Rank(void ){ return 0; };
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
WorldRank = 0;
WorldSize = 1;
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
@@ -97,30 +108,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(0);
}
void CartesianCommunicator::Barrier(void)
{
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ assert(0);}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0;
dest=0;
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
return 0;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
}
}


@@ -39,25 +39,19 @@ namespace Grid {
BACKTRACEFILE(); \
}\
}
int Rank(void) {
return shmem_my_pe();
}
typedef struct HandShake_t {
uint64_t seq_local;
uint64_t seq_remote;
} HandShake;
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init();
XConnections.resize(shmem_n_pes());
@@ -69,7 +63,36 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
RConnections[pe].seq_remote= 0;
}
shmem_barrier_all();
ShmInitGeneric();
}
typedef struct HandShake_t {
uint64_t seq_local;
uint64_t seq_remote;
} HandShake;
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();