Update to use shared memory to contain the stencil comms buffers

Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions
2026-05-21 09:34:17 +01:00 · 2016-10-24 17:30:43 +01:00
parent ea25a4d9ac
commit b6a65059a2
13 changed files with 706 additions and 458 deletions
@@ -156,6 +156,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result; 
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
    assert (norm2(err)< 1.0e-5 );
    Dw.Report();
  }
@@ -208,7 +209,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
-    RealF sum=0;
+    RealD sum=0;
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
    for(int z=0;z<latt4[2];z++){
@@ -226,12 +227,12 @@ int main (int argc, char ** argv)
      }
    }}}}}
    std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
    assert (sum< 1.0e-5 );
    if (1) {
      LatticeFermion sr_eo(sFGrid);
      LatticeFermion serr(sFGrid);
      LatticeFermion ssrc_e (sFrbGrid);
      LatticeFermion ssrc_o (sFrbGrid);
@@ -243,8 +244,6 @@ int main (int argc, char ** argv)
      setCheckerboard(sr_eo,ssrc_o);
      setCheckerboard(sr_eo,ssrc_e);
      serr = sr_eo-ssrc; 
      std::cout<<GridLogMessage << "EO src norm diff   "<< norm2(serr)<<std::endl;
      sr_e = zero;
      sr_o = zero;
@@ -272,9 +271,18 @@ int main (int argc, char ** argv)
      pickCheckerboard(Even,ssrc_e,sresult);
      pickCheckerboard(Odd ,ssrc_o,sresult);
      ssrc_e = ssrc_e - sr_e;
      RealD error = norm2(ssrc_e);
      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
      ssrc_o = ssrc_o - sr_o;
      error+= norm2(ssrc_o);
      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
      if(error>1.0e-5) { 
 	setCheckerboard(ssrc,ssrc_o);
 	setCheckerboard(ssrc,ssrc_e);
 	std::cout<< ssrc << std::endl;
      }
    }
@@ -306,7 +314,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-
+  assert(norm2(err)<1.0e-5);
  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
@@ -349,11 +357,14 @@ int main (int argc, char ** argv)
  err = r_eo-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  assert(norm2(err)<1.0e-5);
  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
  assert(norm2(src_e)<1.0e-5);
  assert(norm2(src_o)<1.0e-5);
  }
@@ -171,14 +171,17 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 /////////////////////////////////////////////////////////
 //
 /////////////////////////////////////////////////////////
 static int Grid_is_initialised = 0;
 void Grid_init(int *argc,char ***argv)
 {
  GridLogger::StopWatch.Start();
  CartesianCommunicator::Init(argc,argv);
  // Parse command line args.
  GridLogger::StopWatch.Start();
  std::string arg;
  std::vector<std::string> logstreams;
  std::string defaultLog("Error,Warning,Message,Performance");
@@ -216,11 +219,14 @@ void Grid_init(int *argc,char ***argv)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
    GridLogTimestamp(1);
  }
  GridParseLayout(*argv,*argc,
 		  Grid_default_latt,
 		  Grid_default_mpi);
@@ -274,6 +280,8 @@ void Grid_init(int *argc,char ***argv)
  std::cout << "GNU General Public License for more details."<<std::endl;
  std::cout << COL_BACKGROUND <<std::endl;
  std::cout << std::endl;
  Grid_is_initialised = 1;
 }
@@ -33,6 +33,7 @@ namespace Grid {
  void Grid_init(int *argc,char ***argv);
  void Grid_finalize(void);
  // internal, controlled with --handle
  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
  void Grid_debug_handler_init(void);
@@ -44,6 +45,7 @@ namespace Grid {
  const std::vector<int> &GridDefaultMpi(void);
  const int              &GridThreads(void)  ;
  void                    GridSetThreads(int t) ;
  void GridLogTimestamp(int);
  // Common parsing chores
  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
@@ -34,8 +34,13 @@ directory
 namespace Grid {
 GridStopWatch Logger::StopWatch;
 int Logger::timestamp;
 std::ostream Logger::devnull(0);
 // Enable (non-zero) or disable (zero) the elapsed-time field that Logger
 // prints on each log line, by forwarding to the class-wide Logger flag.
 void GridLogTimestamp(int on)
 {
   Logger::Timestamp(on);
 }
 Colours GridLogColours(0);
 GridLogger GridLogError(1, "Error", GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
@@ -37,10 +37,11 @@
 #include <execinfo.h>
 #endif
-    namespace Grid {
+namespace Grid {
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Dress the output; use std::chrono for time stamping via the StopWatch class
-int Rank(void); // used for early stage debug before library init
+//////////////////////////////////////////////////////////////////////////////////////////////////
 class Colours{
@@ -55,7 +56,6 @@ public:
  void Active(bool activate){
    is_active=activate;
    if (is_active){
     colour["BLACK"]  ="\033[30m";
     colour["RED"]    ="\033[31m";
@@ -66,21 +66,18 @@ public:
     colour["CYAN"]   ="\033[36m";
     colour["WHITE"]  ="\033[37m";
     colour["NORMAL"] ="\033[0;39m";
-   } else {
+    } else {
-    colour["BLACK"] ="";
+      colour["BLACK"] ="";
-    colour["RED"]   ="";
+      colour["RED"]   ="";
-    colour["GREEN"] ="";
+      colour["GREEN"] ="";
-    colour["YELLOW"]="";
+      colour["YELLOW"]="";
-    colour["BLUE"]  ="";
+      colour["BLUE"]  ="";
-    colour["PURPLE"]="";
+      colour["PURPLE"]="";
-    colour["CYAN"]  ="";
+      colour["CYAN"]  ="";
-    colour["WHITE"] ="";
+      colour["WHITE"] ="";
-    colour["NORMAL"]="";
+      colour["NORMAL"]="";
-  }
+    }
-
+  };
 };
 };
@@ -88,6 +85,7 @@ class Logger {
 protected:
  Colours &Painter;
  int active;
  static int timestamp;
  std::string name, topName;
  std::string COLOUR;
@@ -99,25 +97,28 @@ public:
  std::string evidence() {return Painter.colour["YELLOW"];}
  std::string colour() {return Painter.colour[COLOUR];}
-  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)  : active(on),
-  : active(on),
+    name(nm),
-  name(nm),
+    topName(topNm),
-  topName(topNm),
+    Painter(col_class),
-  Painter(col_class),
+    COLOUR(col) {} ;
  COLOUR(col){} ;
  void Active(int on) {active = on;};
  int  isActive(void) {return active;};
  static void Timestamp(int on) {timestamp = on;};
  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
    if ( log.active ) {
      StopWatch.Stop();
      GridTime now = StopWatch.Elapsed();
      StopWatch.Start();
      stream << log.background()<< log.topName << log.background()<< " : ";
      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
-      stream << log.evidence()<< now << log.background() << " : " << log.colour();
+      if ( log.timestamp ) {
 	StopWatch.Stop();
 	GridTime now = StopWatch.Elapsed();
 	StopWatch.Start();
 	stream << log.evidence()<< now << log.background() << " : " ;
      }
      stream << log.colour();
      return stream;
    } else { 
      return devnull;
@@ -149,7 +150,7 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
 #define BACKTRACEFILE() {\
 char string[20];					\
-std::sprintf(string,"backtrace.%d",Rank());				\
+std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
 std::FILE * fp = std::fopen(string,"w");				\
 BACKTRACEFP(fp)\
 std::fclose(fp);	    \
@@ -1,18 +1,22 @@
 extra_sources=
 if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_mpi3.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_SHMEM
  extra_sources+=communicator/Communicator_shmem.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_NONE
  extra_sources+=communicator/Communicator_none.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
 #
@@ -290,10 +290,11 @@ PARALLEL_FOR_LOOP
  // Unified Comms buffers for all directions
  ///////////////////////////////////////////////////////////
  // Vectors that live on the symmetric heap in case of SHMEM
-  std::vector<commVector<scalar_object> > u_simd_send_buf_hide;
+  //  std::vector<commVector<scalar_object> > u_simd_send_buf_hide;
-  std::vector<commVector<scalar_object> > u_simd_recv_buf_hide;
+  //  std::vector<commVector<scalar_object> > u_simd_recv_buf_hide;
-  commVector<cobj>          u_send_buf;
+  //  commVector<cobj>          u_send_buf_hide;
-  commVector<cobj>          u_recv_buf_hide;
+  //  commVector<cobj>          u_recv_buf_hide;
  // These are used; either SHM objects or refs to the above symmetric heap vectors
  // depending on comms target
  cobj* u_recv_buf_p;
@@ -439,36 +440,19 @@ PARALLEL_FOR_LOOP
    /////////////////////////////////////////////////////////////////////////////////
    const int Nsimd = grid->Nsimd();
-    uint8_t *shm_ptr   = (uint8_t *)_grid->ShmBufferSelf();
+    _grid->ShmBufferFreeAll();
    u_simd_send_buf.resize(Nsimd);
    u_simd_recv_buf.resize(Nsimd);
-    u_send_buf.resize(_unified_buffer_size);
+    u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
-
+    u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
-    if( ShmDirectCopy && shm_ptr != NULL ) {
+    for(int l=0;l<Nsimd;l++){
-
+      u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
-      u_recv_buf_p=(cobj *)shm_ptr; shm_ptr+= _unified_buffer_size*sizeof(cobj);
+      u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
      for(int l=0;l<Nsimd;l++){
 	u_simd_send_buf[l] = (scalar_object *)shm_ptr; shm_ptr += _unified_buffer_size*sizeof(scalar_object);
 	u_simd_recv_buf[l] = (scalar_object *)shm_ptr; shm_ptr += _unified_buffer_size*sizeof(scalar_object);
      }
    } else {
      u_recv_buf_hide.resize(_unified_buffer_size);
      u_simd_send_buf_hide.resize(Nsimd,commVector<scalar_object>(_unified_buffer_size));
      u_simd_recv_buf_hide.resize(Nsimd,commVector<scalar_object>(_unified_buffer_size));
      u_recv_buf_p=&u_recv_buf_hide[0];
      for(int l=0;l<Nsimd;l++){
 	u_simd_send_buf[l] = & u_simd_send_buf_hide[l][0];
 	u_simd_recv_buf[l] = & u_simd_recv_buf_hide[l][0];
      }
    }
    PrecomputeByteOffsets();
  }
  void Local     (int point, int dimension,int shiftpm,int cbmask)
@@ -698,6 +682,7 @@ PARALLEL_FOR_LOOP
    calls++;
    Mergers.resize(0);
    Packets.resize(0);
    _grid->StencilBarrier();
    HaloGather(source,compress);
    this->CommunicateBegin(reqs);
    this->CommunicateComplete(reqs);
@@ -836,19 +821,17 @@ PARALLEL_FOR_LOOP
 	// try the direct copy if possible
 	/////////////////////////////////////////////////////////
-	cobj *u_send_buf_p = &u_send_buf[0];
+
-	if (ShmDirectCopy) { 
+	cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
-	  cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank);
+	if ( (ShmDirectCopy==0)||send_buf==NULL ) { 
-	  if ( shm!=NULL) { 
+	  cobj *send_buf = u_send_buf_p;
 	    u_send_buf_p = shm;
 	  }
 	}
 	t_data-=usecond();
-	Gather_plane_simple_table         (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so);  face_idx++;
+	Gather_plane_simple_table         (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);  face_idx++;
 	t_data+=usecond();
-	AddPacket((void *)&u_send_buf_p[u_comm_offset],
+	AddPacket((void *)&send_buf[u_comm_offset],
 		  (void *)&u_recv_buf_p[u_comm_offset],
 		  xmit_to_rank,
 		  recv_from_rank,
@@ -947,18 +930,16 @@ PARALLEL_FOR_LOOP
 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
-
+	    scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp);
-	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
+	    if ((ShmDirectCopy==0)||(shm==NULL)) { 
-
+	      shm = rp;
-	    auto shm_or_rp = rp;
+	    } 
 	    if (ShmDirectCopy) { 
 	      scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
 	      if ( shm!=NULL) { 
 		shm_or_rp = shm;
 	      }
 	    }
-	    rpointers[i] = shm_or_rp;
+	    // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
 	    // assuming above pointer flip
 	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
 	    rpointers[i] = shm;
 	  } else { 
@@ -0,0 +1,132 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_none.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 namespace Grid {
 ///////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the cartesian layout.
 // Out-of-class definitions of CartesianCommunicator data members;
 // no explicit initialiser is given for any of them here.
 ///////////////////////////////////////////////////////////////
 int CartesianCommunicator::ShmRank;    // returned by CoreRank()
 int CartesianCommunicator::ShmSize;    // returned by Cores()
 int CartesianCommunicator::GroupRank;  // returned by NodeRank()
 int CartesianCommunicator::GroupSize;  // returned by Nodes()
 int CartesianCommunicator::WorldRank;  // returned by RankWorld()
 int CartesianCommunicator::WorldSize;  // returned by Ranks()
 int CartesianCommunicator::Slave;
 // Base address of the comms buffer; assigned by ShmInitGeneric() in builds without GRID_COMMS_MPI3.
 void *              CartesianCommunicator::ShmCommBuf;
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
 // Bump allocator over the comms buffer.
 //   bytes   : size of the region to hand out.
 //   returns : the value heap_top held on entry, as a pointer.
 // heap_top and heap_bytes are both advanced by 'bytes'; the running total is
 // then asserted to be strictly below MAX_MPI_SHM_BYTES.
 // NOTE(review): the caller is presumably expected to have called
 // ShmBufferFreeAll() first so that heap_top is valid -- confirm.
 void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
   // Alignment rounding of the request is currently disabled:
   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
   const size_t block_start = heap_top;
   heap_bytes += bytes;
   heap_top    = block_start + bytes;
   assert(heap_bytes < MAX_MPI_SHM_BYTES);
   return (void *)block_start;
 }
 // Reset the bump allocator used by ShmBufferMalloc(): heap_top goes back to
 // the address returned by ShmBufferSelf() and the byte count goes back to
 // zero, so later ShmBufferMalloc() calls hand out the same region again.
 //   returns : always NULL. The declared return type is 'void *', so a value
 //             must be returned; previously control flowed off the end of the
 //             function, which is undefined behaviour for a non-void function.
 void *CartesianCommunicator::ShmBufferFreeAll(void) { 
   heap_top  =(size_t)ShmBufferSelf();
   heap_bytes=0;
   return NULL;
 }
 /////////////////////////////////
 // Grid information queries
 /////////////////////////////////
 // True (non-zero) when this object's linear processor rank is 0.
 int CartesianCommunicator::IsBoss(void)
 {
   return _processor==0;
 }
 // Rank treated as the boss: always 0.
 int CartesianCommunicator::BossRank(void)
 {
   return 0;
 }
 // Linear processor rank of this object.
 int CartesianCommunicator::ThisRank(void)
 {
   return _processor;
 }
 // Processor coordinate of this object, one entry per dimension.
 const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void)
 {
   return _processor_coor;
 }
 // Extent of the processor grid in each dimension.
 const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)
 {
   return _processors;
 }
 // Total number of processors in the grid.
 int CartesianCommunicator::ProcessorCount(void)
 {
   return _Nprocessors;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
 // Accessors for the class-wide (static) rank/size bookkeeping.
 int CartesianCommunicator::RankWorld(void)
 {
   return WorldRank;
 }
 int CartesianCommunicator::Ranks(void)
 {
   return WorldSize;
 }
 int CartesianCommunicator::Nodes(void)
 {
   return GroupSize;
 }
 int CartesianCommunicator::Cores(void)
 {
   return ShmSize;
 }
 int CartesianCommunicator::NodeRank(void)
 {
   return GroupRank;
 }
 int CartesianCommunicator::CoreRank(void)
 {
   return ShmRank;
 }
 // Complex reductions are expressed through the real-valued vector reductions
 // by viewing each complex number as two consecutive reals.

 // Sum one single-precision complex number: two floats.
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
   float *as_reals = (float *)&c;
   GlobalSumVector(as_reals,2);
 }
 // Sum N single-precision complex numbers: 2*N floats.
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
   float *as_reals = (float *)c;
   GlobalSumVector(as_reals,2*N);
 }
 // Sum one double-precision complex number: two doubles.
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
   double *as_reals = (double *)&c;
   GlobalSumVector(as_reals,2);
 }
 // Sum N double-precision complex numbers: 2*N doubles.
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
   double *as_reals = (double *)c;
   GlobalSumVector(as_reals,2*N);
 }
 #ifndef GRID_COMMS_MPI3
 // Stencil flavour of the asynchronous exchange. In this implementation it
 // forwards every argument unchanged to SendToRecvFromBegin().
 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                        void *xmit, int xmit_to_rank,
                                                        void *recv, int recv_from_rank,
                                                        int bytes)
 {
   SendToRecvFromBegin(list,
                       xmit, xmit_to_rank,
                       recv, recv_from_rank,
                       bytes);
 }
 // Completes requests started by StencilSendToRecvFromBegin(); forwards to
 // SendToRecvFromComplete().
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
 {
   SendToRecvFromComplete(waitall);
 }
 void StencilBarrier(void){};
 // Backing storage for the comms buffer in this implementation; it is sized
 // in ShmInitGeneric().
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
 // Returns the base pointer of this rank's own comms buffer (ShmCommBuf).
 void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
 // Comms buffer of 'rank' as seen from this process.
 //   returns : ShmCommBuf when 'rank' equals ShmRank, otherwise NULL.
 void *CartesianCommunicator::ShmBuffer(int rank) {
   if (rank == ShmRank) {
     return ShmCommBuf;
   }
   return NULL;
 }
 // Translate a pointer into this process's comms buffer to the matching
 // address for 'rank'.
 //   returns : 'local_p' unchanged when 'rank' equals ShmRank, otherwise NULL.
 // NOTE(review): 'rank' is compared directly with the static ShmRank; confirm
 // that callers pass a rank in the same numbering for every comms build.
 void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { 
   if (rank == ShmRank) {
     return local_p;
   }
   return NULL;
 }
 // Set up the comms buffer for this implementation: size the backing vector
 // to MAX_MPI_SHM_BYTES and publish the address of its first element as
 // ShmCommBuf.
 void CartesianCommunicator::ShmInitGeneric(void){
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   void *base = (void *)&ShmBufStorageVector[0];
   ShmCommBuf = base;
 }
 #endif
 }
@@ -40,169 +40,188 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
 namespace Grid {
 class CartesianCommunicator {
  public:    
  // 65536 ranks per node adequate for now
  // 128MB shared memory for comms enough for 48^4 local vol comms
  // Give external control (command line override?) of this
  static const int      MAXLOG2RANKSPERNODE = 16;            
  static const uint64_t MAX_MPI_SHM_BYTES   = 128*1024*1024; 
  // Communicator should know nothing of the physics grid, only processor grid.
-  
+  int              _Nprocessors;     // How many in all
-    int              _Nprocessors;     // How many in all
+  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
-    std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
+  int              _processor;       // linear processor rank
-    int              _processor;       // linear processor rank
+  std::vector<int> _processor_coor;  // linear processor coordinate
-    std::vector<int> _processor_coor;  // linear processor coordinate
+  unsigned long _ndimension;
    unsigned long _ndimension;
-#ifdef GRID_COMMS_MPI
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
-    MPI_Comm communicator;
+  MPI_Comm communicator;
-    typedef MPI_Request CommsRequest_t;
+  static MPI_Comm communicator_world;
-#elif  GRID_COMMS_MPI3
+  typedef MPI_Request CommsRequest_t;
    int shm_mode;
    MPI_Comm communicator;
    typedef MPI_Request CommsRequest_t;
    const int MAXLOG2RANKSPERNODE = 16;     // 65536 ranks per node adequate for now
    const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enought for 48^4 local vol comms
    std::vector<int>  WorldDims;
    std::vector<int>  GroupDims;
    std::vector<int>  ShmDims;
    std::vector<int> GroupCoor;
    std::vector<int> ShmCoor;
    std::vector<int> WorldCoor;
    static std::vector<int> GroupRanks; 
    static std::vector<int> MyGroup;
    static int ShmSetup;
    static MPI_Win ShmWindow; 
    static MPI_Comm ShmComm;
    void * ShmCommBuf;
    std::vector<void *> ShmCommBufs;
    int WorldRank;
    int WorldSize;
    static int ShmRank;
    static int ShmSize;
    static int GroupSize;
    static int GroupRank;
    std::vector<int>  LexicographicToWorldRank;
 #else 
-    typedef int CommsRequest_t;
+  typedef int CommsRequest_t;
 #endif
-    static void Init(int *argc, char ***argv);
+  ////////////////////////////////////////////////////////////////////
  // Helper functionality for SHM Windows common to all other impls
  ////////////////////////////////////////////////////////////////////
  // Longer term; drop this in favour of a master / slave model with 
  // cartesian communicator on a subset of ranks, slave ranks controlled
  // by group leader with data xfer via shared memory
  ////////////////////////////////////////////////////////////////////
 #ifdef  GRID_COMMS_MPI3
  std::vector<int>  WorldDims;
  std::vector<int>  GroupDims;
  std::vector<int>  ShmDims;
  std::vector<int> GroupCoor;
  std::vector<int> ShmCoor;
  std::vector<int> WorldCoor;
  static std::vector<int> GroupRanks; 
  static std::vector<int> MyGroup;
  static int ShmSetup;
  static MPI_Win ShmWindow; 
  static MPI_Comm ShmComm;
  std::vector<int>  LexicographicToWorldRank;
  static std::vector<void *> ShmCommBufs;
 #else 
  static void ShmInitGeneric(void);
  static commVector<uint8_t> ShmBufStorageVector;
 #endif 
  static void * ShmCommBuf;
  size_t heap_top;
  size_t heap_bytes;
  void *ShmBufferSelf(void);
  void *ShmBuffer(int rank);
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void *ShmBufferFreeAll(void) ;
  ////////////////////////////////////////////////
  // Must call in Grid startup
  ////////////////////////////////////////////////
  static void Init(int *argc, char ***argv);
  ////////////////////////////////////////////////
  // Constructor of any given grid
  ////////////////////////////////////////////////
  CartesianCommunicator(const std::vector<int> &pdimensions_in);
  ////////////////////////////////////////////////////////////////////////////////////////
  // Wraps MPI_Cart routines, or implements equivalent on other impls
  ////////////////////////////////////////////////////////////////////////////////////////
  void ShiftedRanks(int dim,int shift,int & source, int & dest);
  int  RankFromProcessorCoor(std::vector<int> &coor);
  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
  /////////////////////////////////
  // Grid information and queries
  /////////////////////////////////
  static int ShmRank;
  static int ShmSize;
  static int GroupSize;
  static int GroupRank;
  static int WorldRank;
  static int WorldSize;
  static int Slave;
  int                      IsBoss(void)            ;
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const std::vector<int> & ThisProcessorCoor(void) ;
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;
  static int Ranks    (void);
  static int Nodes    (void);
  static int Cores    (void);
  static int NodeRank (void);
  static int CoreRank (void);
-    // Constructor
+  ////////////////////////////////////////////////////////////////////////////////
-    CartesianCommunicator(const std::vector<int> &pdimensions_in);
+  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);
  void GlobalSum(RealD &);
  void GlobalSumVector(RealD *,int N);
  void GlobalSum(uint32_t &);
  void GlobalSum(uint64_t &);
  void GlobalSum(ComplexF &c);
  void GlobalSumVector(ComplexF *c,int N);
  void GlobalSum(ComplexD &c);
  void GlobalSumVector(ComplexD *c,int N);
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
    scalar_type * ptr = (scalar_type *)& o;
    GlobalSumVector(ptr,words);
  }
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
  void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
 		      int recv_from_rank,
 		      int bytes);
  void SendRecvPacket(void *xmit,
 		      void *recv,
 		      int xmit_to_rank,
 		      int recv_from_rank,
 		      int bytes);
  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			   void *xmit,
 			   int xmit_to_rank,
 			   void *recv,
 			   int recv_from_rank,
 			   int bytes);
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
-    // Wraps MPI_Cart routines
+  void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-    void ShiftedRanks(int dim,int shift,int & source, int & dest);
+				  void *xmit,
-    int  RankFromProcessorCoor(std::vector<int> &coor);
+				  int xmit_to_rank,
-    void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
+				  void *recv,
 				  int recv_from_rank,
 				  int bytes);
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
  void StencilBarrier(void);
-    // Helper function for SHM Windows in MPI3
+  ////////////////////////////////////////////////////////////
-    void *ShmBufferSelf(void);
+  // Barrier
-    void *ShmBuffer(int rank);
+  ////////////////////////////////////////////////////////////
-
+  void Barrier(void);
-    /////////////////////////////////
+  
-    // Grid information queries
+  ////////////////////////////////////////////////////////////
-    /////////////////////////////////
+  // Broadcast a buffer and composite larger
-    int                      IsBoss(void)            { return _processor==0; };
+  ////////////////////////////////////////////////////////////
-    int                      BossRank(void)          { return 0; };
+  void Broadcast(int root,void* data, int bytes);
-    int                      ThisRank(void)          { return _processor; };
+  
-    const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
+  template<class obj> void Broadcast(int root,obj &data)
    const std::vector<int> & ProcessorGrid(void)     { return _processors; };
    int                      ProcessorCount(void)    { return _Nprocessors; };
    ////////////////////////////////////////////////////////////
    // Reduction
    ////////////////////////////////////////////////////////////
    void GlobalSum(RealF &);
    void GlobalSumVector(RealF *,int N);
    void GlobalSum(RealD &);
    void GlobalSumVector(RealD *,int N);
    void GlobalSum(uint32_t &);
    void GlobalSum(uint64_t &);
    void GlobalSum(ComplexF &c)
    {
      GlobalSumVector((float *)&c,2);
    }
    void GlobalSumVector(ComplexF *c,int N)
    {
      GlobalSumVector((float *)c,2*N);
    }
    void GlobalSum(ComplexD &c)
    {
      GlobalSumVector((double *)&c,2);
    }
    void GlobalSumVector(ComplexD *c,int N)
    {
      GlobalSumVector((double *)c,2*N);
    }
    template<class obj> void GlobalSum(obj &o){
      typedef typename obj::scalar_type scalar_type;
      int words = sizeof(obj)/sizeof(scalar_type);
      scalar_type * ptr = (scalar_type *)& o;
      GlobalSumVector(ptr,words);
    }
    ////////////////////////////////////////////////////////////
    // Face exchange, buffer swap in translational invariant way
    ////////////////////////////////////////////////////////////
    void SendToRecvFrom(void *xmit,
 			int xmit_to_rank,
 			void *recv,
 			int recv_from_rank,
 			int bytes);
    void SendRecvPacket(void *xmit,
 			void *recv,
 			int xmit_to_rank,
 			int recv_from_rank,
 			int bytes);
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
    void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
    void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
    {
      SendToRecvFromComplete(waitall);
    }
    ////////////////////////////////////////////////////////////
    // Barrier
    ////////////////////////////////////////////////////////////
    void Barrier(void);
    ////////////////////////////////////////////////////////////
    // Broadcast a buffer and composite larger
    ////////////////////////////////////////////////////////////
    void Broadcast(int root,void* data, int bytes);
    template<class obj> void Broadcast(int root,obj &data)
    {
      Broadcast(root,(void *)&data,sizeof(data));
    };
    static void BroadcastWorld(int root,void* data, int bytes);
 }; 
 }
@@ -30,19 +30,28 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
-  // Should error check all MPI calls.
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and independent of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 MPI_Comm CartesianCommunicator::communicator_world;
 // Should error check all MPI calls.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
-}
+  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-
+  MPI_Comm_rank(communicator_world,&WorldRank);
-int Rank(void) {
+  MPI_Comm_size(communicator_world,&WorldSize);
-  int pe;
+  ShmRank=0;
-  MPI_Comm_rank(MPI_COMM_WORLD,&pe);
+  ShmSize=1;
-  return pe;
+  GroupRank=WorldRank;
  GroupSize=WorldSize;
  Slave    =0;
  ShmInitGeneric();
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
@@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  _processors = processors;
  _processor_coor.resize(_ndimension);
-  MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
+  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@@ -67,15 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  assert(Size==_Nprocessors);
 }
 void *CartesianCommunicator::ShmBufferSelf(void)
 {
  return NULL;
 }
 void *CartesianCommunicator::ShmBuffer(int rank)
 {
  return NULL;
 }
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
@@ -194,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 		     communicator);
  assert(ierr==0);
 }
-
+  ///////////////////////////////////////////////////////
  // Should only be used prior to Grid Init finished.
  // Check for this?
  ///////////////////////////////////////////////////////
 // Broadcasts `bytes` raw bytes (MPI_BYTE) starting at `data` from rank
 // `root` via MPI_Bcast. The communicator is the final MPI_Bcast argument.
 // A non-zero MPI return code trips the assert.
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
 		      root,
-		      MPI_COMM_WORLD);
+		      communicator_world);
  assert(ierr==0);
 }
@@ -1,4 +1,3 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -33,26 +32,197 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 int CartesianCommunicator::ShmSetup = 0;
-// Global used by Init and nowhere else. How to hide?
+MPI_Comm CartesianCommunicator::communicator_world;
-int Rank(void) {
+MPI_Comm CartesianCommunicator::ShmComm;
-  int pe;
+MPI_Win  CartesianCommunicator::ShmWindow;
-  MPI_Comm_rank(MPI_COMM_WORLD,&pe);
+
-  return pe;
+std::vector<int> CartesianCommunicator::GroupRanks;  
 std::vector<int> CartesianCommunicator::MyGroup;
 std::vector<void *> CartesianCommunicator::ShmCommBufs;
 // Returns this rank's own entry of the shared buffer table, i.e.
 // ShmCommBufs[ShmRank].
 void *CartesianCommunicator::ShmBufferSelf(void)
 {
  return ShmCommBufs[ShmRank];
 }
-  // Should error check all MPI calls.
+void *CartesianCommunicator::ShmBuffer(int rank)
 {
  // GroupRanks[rank] is the translation of `rank` produced in Init by
  // MPI_Group_translate_ranks (world group -> shared-memory group).
  // MPI_UNDEFINED means `rank` has no entry in the shared-memory group,
  // so there is no buffer to hand back.
  int gpeer = GroupRanks[rank];
  if (gpeer == MPI_UNDEFINED){
    return NULL;
  } else { 
    // Otherwise the translated rank indexes the per-rank buffer table.
    return ShmCommBufs[gpeer];
  }
 }
 // Rebases `local_p` from this rank's buffer (ShmCommBufs[ShmRank]) onto
 // the buffer of `rank`: the byte offset of `local_p` from our own base
 // is added to the peer's base address.
 // Returns NULL when GroupRanks[rank] is MPI_UNDEFINED.
 void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
 {
  const int peer = GroupRanks[rank];
  if (peer != MPI_UNDEFINED) {
    const uint64_t self_base = (uint64_t)ShmCommBufs[ShmRank];
    const uint64_t peer_base = (uint64_t)ShmCommBufs[peer];
    // Same modular arithmetic as base + (pointer - own base).
    return (void *)(peer_base + ((uint64_t)local_p - self_base));
  }
  return NULL;
 }
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
 }
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Want to implement some magic ... Group sub-cubes into those on same node
  //
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  MPI_Comm_rank(communicator_world,&WorldRank);
  MPI_Comm_size(communicator_world,&WorldSize);
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
  MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
  MPI_Comm_rank(ShmComm     ,&ShmRank);
  MPI_Comm_size(ShmComm     ,&ShmSize);
  GroupSize = WorldSize/ShmSize;
  /////////////////////////////////////////////////////////////////////
  // find world ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group WorldGroup, ShmGroup;
  MPI_Comm_group (communicator_world, &WorldGroup); 
  MPI_Comm_group (ShmComm, &ShmGroup);
  std::vector<int> world_ranks(WorldSize); 
  GroupRanks.resize(WorldSize); 
  MyGroup.resize(ShmSize);
  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]); 
  ///////////////////////////////////////////////////////////////////
  // Identify who is in my group and nominate the leader
    ///////////////////////////////////////////////////////////////////
  int g=0;
  for(int rank=0;rank<WorldSize;rank++){
    if(GroupRanks[rank]!=MPI_UNDEFINED){
      assert(g<ShmSize);
      MyGroup[g++] = rank;
    }
  }
  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
  int myleader = MyGroup[0];
  std::vector<int> leaders_1hot(WorldSize,0);
  std::vector<int> leaders_group(GroupSize,0);
  leaders_1hot [ myleader ] = 1;
  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
  assert(ierr==0);
  ///////////////////////////////////////////////////////////////////
  // find the group leaders world rank
  ///////////////////////////////////////////////////////////////////
  int group=0;
  for(int l=0;l<WorldSize;l++){
    if(leaders_1hot[l]){
      leaders_group[group++] = l;
    }
  }
  ///////////////////////////////////////////////////////////////////
  // Identify the rank of the group in which I (and my leader) live
  ///////////////////////////////////////////////////////////////////
  GroupRank=-1;
  for(int g=0;g<GroupSize;g++){
    if (myleader == leaders_group[g]){
      GroupRank=g;
    }
  }
  assert(GroupRank!=-1);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared window for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  ShmCommBuf = 0;
  ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
  assert(ierr==0);
  // KNL hack -- force to numa-domain 1 in flat
 #if 0
  for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
    void *pages = (void *) ( page + ShmCommBuf );
    int status;
    int flags=MPOL_MF_MOVE_ALL;
    int nodes=1; // numa domain == MCDRAM
    unsigned long count=1;
    ierr= move_pages(0,count, &pages,&nodes,&status,flags);
    if (ierr && (page==0)) perror("numa relocate command failed");
  }
 #endif
  MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  ShmCommBufs.resize(ShmSize);
  for(int r=0;r<ShmSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Verbose for now
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  if (WorldRank == 0){
    std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
    std::cout<< WorldSize << " Ranks " ;
    std::cout<< GroupSize << " Nodes " ;
    std::cout<<  ShmSize  << " with ranks-per-node "<<std::endl;
    std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size ";
    std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
    for(int g=0;g<GroupSize;g++){
      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
    }
    std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
    for(int g=0;g<ShmSize;g++){
      std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
      if(g!=ShmSize-1) std::cout<<",";
      else std::cout<<"}"<<std::endl;
    }
  }
  for(int g=0;g<GroupSize;g++){
    if ( (ShmRank == 0) && (GroupRank==g) )  std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
    for(int r=0;r<ShmSize;r++){
      if ( (ShmRank == 0) && (GroupRank==g) ) {
 	std::cout<<MyGroup[r];
 	if(r<ShmSize-1) std::cout<<",";
 	else std::cout<<"}"<<std::endl;
      }
      MPI_Barrier(communicator_world);
    }
  }
  assert(ShmSetup==0);  ShmSetup=1;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Want to implement some magic ... Group sub-cubes into those on same node
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  std::vector<int> coor = _processor_coor;
@@ -80,139 +250,13 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
  rank = LexicographicToWorldRank[rank];
 }
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Static data members of CartesianCommunicator, defined once here.
 // ShmSetup starts at 0 and is set to 1 once the shared-memory setup has run.
 int CartesianCommunicator::ShmSetup = 0;
 // Rank and size obtained from ShmComm via MPI_Comm_rank / MPI_Comm_size.
 int CartesianCommunicator::ShmRank;
 int CartesianCommunicator::ShmSize;
 // GroupSize is computed as WorldSize/ShmSize; GroupRank is the index of
 // this rank's group leader in the list of leaders.
 int CartesianCommunicator::GroupRank;
 int CartesianCommunicator::GroupSize;
 // Communicator produced by MPI_Comm_split_type(..., MPI_COMM_TYPE_SHARED, ...).
 MPI_Comm CartesianCommunicator::ShmComm;
 // Window handle filled in by MPI_Win_allocate_shared on ShmComm.
 MPI_Win  CartesianCommunicator::ShmWindow;
 // Per world rank: its rank in the shared-memory group, or MPI_UNDEFINED.
 std::vector<int> CartesianCommunicator::GroupRanks; 
 // World ranks whose GroupRanks entry is not MPI_UNDEFINED.
 std::vector<int> CartesianCommunicator::MyGroup;
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 { 
  _ndimension = processors.size();
  WorldDims = processors;
  communicator = MPI_COMM_WORLD;
  MPI_Comm_rank(communicator,&WorldRank);
  MPI_Comm_size(communicator,&WorldSize);
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Does every grid need one, or could we share across all grids via a singleton/guard?
  int ierr;
-  if ( !ShmSetup ) { 
+  communicator=communicator_world;
-    MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
+  _ndimension = processors.size();
    MPI_Comm_rank(ShmComm     ,&ShmRank);
    MPI_Comm_size(ShmComm     ,&ShmSize);
    GroupSize = WorldSize/ShmSize;
    /////////////////////////////////////////////////////////////////////
    // find world ranks in our SHM group (i.e. which ranks are on our node)
    /////////////////////////////////////////////////////////////////////
    MPI_Group WorldGroup, ShmGroup;
    MPI_Comm_group (communicator, &WorldGroup); 
    MPI_Comm_group (ShmComm, &ShmGroup);
    std::vector<int> world_ranks(WorldSize); 
    GroupRanks.resize(WorldSize); 
    MyGroup.resize(ShmSize);
    for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
    MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]); 
    ///////////////////////////////////////////////////////////////////
    // Identify who is in my group and nominate the leader
    ///////////////////////////////////////////////////////////////////
    int g=0;
    for(int rank=0;rank<WorldSize;rank++){
      if(GroupRanks[rank]!=MPI_UNDEFINED){
 	assert(g<ShmSize);
 	MyGroup[g++] = rank;
      }
    }
    std::sort(MyGroup.begin(),MyGroup.end(),std::greater<int>());
    int myleader = MyGroup[0];
    std::vector<int> leaders_1hot(WorldSize,0);
    std::vector<int> leaders_group(GroupSize,0);
    leaders_1hot [ myleader ] = 1;
    ///////////////////////////////////////////////////////////////////
    // global sum leaders over comm world
    ///////////////////////////////////////////////////////////////////
    ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
    assert(ierr==0);
    ///////////////////////////////////////////////////////////////////
    // find the group leaders world rank
    ///////////////////////////////////////////////////////////////////
    int group=0;
    for(int l=0;l<WorldSize;l++){
      if(leaders_1hot[l]){
 	leaders_group[group++] = l;
      }
    }
    ///////////////////////////////////////////////////////////////////
    // Identify the rank of the group in which I (and my leader) live
    ///////////////////////////////////////////////////////////////////
    GroupRank=-1;
    for(int g=0;g<GroupSize;g++){
      if (myleader == leaders_group[g]){
 	GroupRank=g;
      }
    }
    assert(GroupRank!=-1);
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // allocate the shared window for our group
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ShmCommBuf = 0;
    ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
    assert(ierr==0);
    for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
      void *pages = (void *) ( page + ShmCommBuf );
      int status;
      int flags=MPOL_MF_MOVE_ALL;
      int nodes=1; // numa domain == MCDRAM
      unsigned long count=1;
      ierr= move_pages(0,count, &pages,&nodes,&status,flags);
      if (ierr && (page==0)) perror("numa relocate command failed");
    }
    MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Verbose for now
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    std::cout<<GridLogMessage<< "MPI-3 configuration: Ranks per node "<< ShmSize ;
    std::cout<< " Nodes "<< GroupSize;
    std::cout<< " Ranks "<< WorldSize;
    std::cout<< " Shm CommBuf address"<< std::hex <<ShmCommBuf << std::dec<<std::endl;
    // Done
    ShmSetup=1;
  }
  ShmCommBufs.resize(ShmSize);
  for(int r=0;r<ShmSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
  }
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
@@ -232,6 +276,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  ////////////////////////////////////////////////////////////////
  int dim = 0;
  std::vector<int> WorldDims = processors;
  ShmDims.resize(_ndimension,1);
  GroupDims.resize(_ndimension);
@@ -346,21 +392,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
  }
 }
 // Returns this rank's own entry of the shared buffer table, i.e.
 // ShmCommBufs[ShmRank].
 void *CartesianCommunicator::ShmBufferSelf(void)
 {
  return ShmCommBufs[ShmRank];
 }
 // Looks `rank` up in GroupRanks and returns the matching ShmCommBufs
 // entry, or NULL when the lookup yields MPI_UNDEFINED.
 void *CartesianCommunicator::ShmBuffer(int rank)
 {
  const int peer = GroupRanks[rank];
  if (peer != MPI_UNDEFINED) return ShmCommBufs[peer];
  return NULL;
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
@@ -369,6 +400,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
 #if 1
  MPI_Request xrq;
  MPI_Request rrq;
@@ -387,12 +419,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  sequence++;
  char *to_ptr   = (char *)ShmCommBufs[gdest];
  char *from_ptr = (char *)ShmCommBufs[ShmRank];
  int small = (bytes<MAX_MPI_SHM_BYTES);
-  typedef vRealD T;
+  typedef uint64_t T;
  int words = bytes/sizeof(T);
  assert(((size_t)bytes &(sizeof(T)-1))==0);
@@ -400,13 +431,18 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  if ( small && (gdest !=MPI_UNDEFINED) ) {
    char *to_ptr   = (char *)ShmCommBufs[gdest];
    assert(gme != gdest);
    T *ip = (T *)xmit;
    T *op = (T *)to_ptr;
 PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
-      vstream(op[w],ip[w]);
+      op[w]=ip[w];
      if ( w == 0 ) { 
 	//	std::cout << " xmit "<< ShmRank <<" -> "<< gdest<<" " <<std::hex<<op[w]<<std::dec<<std::endl;
      }
    }
    bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
@@ -426,7 +462,10 @@ PARALLEL_FOR_LOOP
    T *op = (T *)recv;
 PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
-      vstream(op[w],ip[w]);
+      op[w]=ip[w];
      if ( w == 0 ) { 
 	//	std::cout << " recv "<< ShmRank <<" <- "<< gfrom<<" " <<std::hex<<op[w]<<std::dec<<std::endl;
      }
    }
    bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag));
    bcopy(&from_ptr[bytes+4],&check,sizeof(check));
@@ -441,6 +480,19 @@ PARALLEL_FOR_LOOP
  MPI_Win_sync (ShmWindow);   
  MPI_Barrier  (ShmComm);
  MPI_Win_sync (ShmWindow);   
 #else
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor;
  int ierr;
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 #endif
 }
 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -476,19 +528,29 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
    list.push_back(rrq);
  }
  StencilBarrier();
 }
 // Completes the requests in `list` by forwarding directly to
 // SendToRecvFromComplete; no additional work is done here.
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  SendToRecvFromComplete(list);
 }
 // Issues MPI_Win_sync on ShmWindow, then MPI_Barrier on ShmComm, then
 // MPI_Win_sync on ShmWindow again, in that order.
 // NOTE(review): presumably the sync/barrier/sync sequence makes stores to
 // the shared window visible to the other ranks in ShmComm before they
 // read them -- confirm against the MPI-3 RMA memory model.
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Win_sync (ShmWindow);   
  MPI_Barrier  (ShmComm);
  MPI_Win_sync (ShmWindow);   
 }
 // Blocks in MPI_Waitall until every request in `list` has completed and
 // asserts that the call returned 0.
 //
 // list : outstanding requests to wait on. It may be empty (for example
 //        when no MPI request was queued); in that case the function
 //        returns immediately, because forming &list[0] / &status[0] on an
 //        empty vector is undefined behaviour and there is nothing to wait
 //        for.
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
 }
@@ -514,7 +576,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 		      bytes,
 		      MPI_BYTE,
 		      root,
-		      MPI_COMM_WORLD);
+		      communicator_world);
  assert(ierr==0);
 }
@@ -28,18 +28,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "Grid.h"
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Definitions of CartesianCommunicator's static data members. As objects
 // of static storage duration the ints and the pointer are zero-initialised
 // until something assigns them.
 int CartesianCommunicator::ShmRank;
 int CartesianCommunicator::ShmSize;
 int CartesianCommunicator::GroupRank;
 int CartesianCommunicator::GroupSize;
 int CartesianCommunicator::WorldRank;
 int CartesianCommunicator::WorldSize;
 int CartesianCommunicator::Slave;
 // NOTE(review): ShmBufStorageVector presumably provides the storage that
 // ShmCommBuf points at (set up outside this view) -- confirm.
 void *              CartesianCommunicator::ShmCommBuf;
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
 void CartesianCommunicator::Init(int *argc, char *** arv)
 {
-}
+  WorldRank = 0;
-
+  WorldSize = 1;
-int Rank(void ){ return 0; };
+  ShmRank=0;
-void *CartesianCommunicator::ShmBufferSelf(void)
+  ShmSize=1;
-{
+  GroupRank=_WorldRank;
-  return NULL;
+  GroupSize=_WorldSize;
-}
+  Slave    =0;
-void *CartesianCommunicator::ShmBuffer(int rank)
+  ShmInitGeneric();
 {
  return NULL;
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
@@ -97,30 +108,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
  assert(0);
 }
-void CartesianCommunicator::Barrier(void)
+void CartesianCommunicator::Barrier(void){}
-{
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
-}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
-
+int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);}
 {
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
 }
 // Reports rank 0 as both the source and the destination of a shift;
 // `dim` and `shift` are not consulted.
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  source = dest = 0;
 }
 // Returns rank 0 for any coordinate; `coor` is not read.
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  return 0;
 }
 // No-op in this version: `coor` is left untouched and `rank` is ignored.
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
 }
 }
@@ -39,25 +39,19 @@ namespace Grid {
    BACKTRACEFILE();		   \
  }\
 }
 // Returns the calling process's PE number as reported by shmem_my_pe().
 int Rank(void) {
  return shmem_my_pe();
 }
 // Pair of 64-bit sequence counters.
 // NOTE(review): presumably seq_local / seq_remote track the two ends of
 // one connection's handshake -- confirm against the send/recv code.
 typedef struct HandShake_t { 
  uint64_t seq_local;
  uint64_t seq_remote;
 } HandShake;
 // File-local tables of HandShake records.
 // NOTE(review): X / R presumably stand for transmit / receive -- confirm.
 static Vector< HandShake > XConnections;
 static Vector< HandShake > RConnections;
-void *CartesianCommunicator::ShmBufferSelf(void)
+///////////////////////////////////////////////////////////////////////////////////////////////////
-{
+// Info that is set up once and is independent of the cartesian layout
-  return NULL;
+///////////////////////////////////////////////////////////////////////////////////////////////////
-}
+int CartesianCommunicator::ShmRank;
-void *CartesianCommunicator::ShmBuffer(int rank)
+int CartesianCommunicator::ShmSize;
-{
+int CartesianCommunicator::GroupRank;
-  return NULL;
+int CartesianCommunicator::GroupSize;
-}
+int CartesianCommunicator::WorldRank;
 int CartesianCommunicator::WorldSize;
 int CartesianCommunicator::Slave;
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  shmem_init();
  XConnections.resize(shmem_n_pes());
@@ -69,7 +63,36 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
    RConnections[pe].seq_remote= 0;
  }
  shmem_barrier_all();
  ShmInitGeneric();
 }
 // Should error check all MPI calls.
 // Initialises MPI if nobody has done so yet, then records the world
 // communicator and the default (one rank per group) layout.
 //
 // argc, argv : forwarded to MPI_Init when this function is the one that
 //              initialises MPI.
 //
 // The communicator duplication and the rank/size bookkeeping are done
 // unconditionally: when another library has already called MPI_Init the
 // `if` body is skipped, and leaving the setup inside it would leave
 // communicator_world and the rank/size fields unset.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  MPI_Comm_rank(communicator_world,&_WorldRank);
  MPI_Comm_size(communicator_world,&_WorldSize);
  // Defaults: every rank is alone in its shared-memory group.
  _ShmRank=0;
  _ShmSize=1;
  _GroupRank=_WorldRank;
  _GroupSize=_WorldSize;
  _Slave    =0;
 }
 // Pair of 64-bit sequence counters.
 // NOTE(review): presumably seq_local / seq_remote track the two ends of
 // one connection's handshake -- confirm against the send/recv code.
 typedef struct HandShake_t { 
  uint64_t seq_local;
  uint64_t seq_remote;
 } HandShake;
 // File-local tables of HandShake records.
 // NOTE(review): X / R presumably stand for transmit / receive -- confirm.
 static Vector< HandShake > XConnections;
 static Vector< HandShake > RConnections;
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _ndimension = processors.size();