Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-04 05:54:32 +00:00)

Commit: Simplify communicators
@@ -1,222 +0,0 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/communicator/Communicator_mpi.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>

namespace Grid {


///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;

// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  int provided;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
    if ( provided != MPI_THREAD_MULTIPLE ) {
      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
    }
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  ShmInitGeneric();
}
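
// Init above requests MPI_THREAD_MULTIPLE and, if the MPI library grants a lower thread level,
// drops back to the CommsThenCompute strategy rather than overlapping communication with compute.
// An illustrative sketch of that query-and-downgrade decision in isolation; the function name
// demo_thread_level_ok is hypothetical and not part of the original file:
static bool demo_thread_level_ok(int *argc, char ***argv) {
  int already_up = 0;
  MPI_Initialized(&already_up);       // tolerate a host application that initialised MPI first
  int provided = MPI_THREAD_SINGLE;
  if (already_up) MPI_Query_thread(&provided);
  else            MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
  // Only MPI_THREAD_MULTIPLE permits concurrent MPI calls from several threads, which is what a
  // communicate-while-computing strategy needs.
  return (provided == MPI_THREAD_MULTIPLE);
}
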
CartesianCommunicator::~CartesianCommunicator()
{
  int MPI_is_finalised;
  MPI_Finalized(&MPI_is_finalised);
  if (communicator && !MPI_is_finalised)
    MPI_Comm_free(&communicator);
}

void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
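
// The GlobalSum/GlobalXOR family above all use the same in-place MPI_Allreduce idiom: every rank
// passes MPI_IN_PLACE as the send buffer so the result overwrites the local value without an
// extra staging buffer. A minimal illustrative sketch of that idiom outside the class
// (demo_global_sum and the double payload are arbitrary, hypothetical choices):
static double demo_global_sum(double local, MPI_Comm comm) {
  double value = local;
  // After the call every rank in comm holds the sum of all the local contributions.
  int ierr = MPI_Allreduce(MPI_IN_PLACE, &value, 1, MPI_DOUBLE, MPI_SUM, comm);
  assert(ierr == 0);
  return value;
}
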
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int rank;
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
  assert(ierr==0);
  return rank;
}
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  coor.resize(_ndimension);
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
  assert(ierr==0);
}
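
// ShiftedRanks, RankFromProcessorCoor and ProcessorCoorFromRank above are thin wrappers over the
// MPI Cartesian topology attached to "communicator": MPI_Cart_shift names the neighbours one hop
// away along a dimension, and MPI_Cart_rank / MPI_Cart_coords convert between ranks and grid
// coordinates. An illustrative sketch that builds its own periodic 2x2 grid (so it assumes a
// 4-rank job; demo_cart_dim0_neighbours is hypothetical, not original code) and reports the
// dimension-0 neighbours:
static void demo_cart_dim0_neighbours(int &source, int &dest) {
  int dims[2]     = {2, 2};
  int periodic[2] = {1, 1};            // wrap around, as the Grid communicator does
  MPI_Comm cart;
  MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periodic, 1, &cart);

  int me, coor[2];
  MPI_Comm_rank(cart, &me);
  MPI_Cart_coords(cart, me, 2, coor);  // my coordinate on the 2x2 grid

  // A shift by +1 in dimension 0 sends to "dest" and receives from "source".
  MPI_Cart_shift(cart, 0, 1, &source, &dest);
  MPI_Comm_free(&cart);
}
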
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
					   int dest,
					   void *recv,
					   int from,
					   int bytes)
{
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
}

void CartesianCommunicator::SendRecvPacket(void *xmit,
					   void *recv,
					   int sender,
					   int receiver,
					   int bytes)
{
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
						void *xmit,
						int dest,
						void *recv,
						int from,
						int bytes)
{
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    MPI_Request xrq;
    MPI_Request rrq;

    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);

    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else { 
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
		      recv,bytes,MPI_CHAR,from, from,
		      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    int nreq=list.size();
    std::vector<MPI_Status> status(nreq);
    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
    assert(ierr==0);
  }
}
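
// SendToRecvFromBegin/Complete above implement two policies: "Concurrent" posts an MPI_Irecv and
// an MPI_Isend and defers completion to MPI_Waitall, while the other branch hands the CPU to MPI
// at once with a blocking MPI_Sendrecv. An illustrative side-by-side sketch of the two shapes for
// a single exchange (the demo_ names and char buffers are assumptions, not original code):
static void demo_exchange_concurrent(char *xmit, char *recv, int dest, int from,
                                     int bytes, MPI_Comm comm, int me) {
  MPI_Request reqs[2];
  MPI_Irecv(recv, bytes, MPI_CHAR, from, from, comm, &reqs[0]);  // tag = sender's rank
  MPI_Isend(xmit, bytes, MPI_CHAR, dest, me,   comm, &reqs[1]);
  // ... compute could overlap here ...
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);                     // the "Complete" step
}

static void demo_exchange_sequential(char *xmit, char *recv, int dest, int from,
                                     int bytes, MPI_Comm comm, int me) {
  // Single blocking call: simpler, but no room to overlap communication with compute.
  int ierr = MPI_Sendrecv(xmit, bytes, MPI_CHAR, dest, me,
                          recv, bytes, MPI_CHAR, from, from,
                          comm, MPI_STATUS_IGNORE);
  assert(ierr == 0);
}
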
void CartesianCommunicator::Barrier(void)
{
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  int ierr=MPI_Bcast(data,
		     bytes,
		     MPI_BYTE,
		     root,
		     communicator);
  assert(ierr==0);
}
  ///////////////////////////////////////////////////////
  // Should only be used prior to Grid Init finished.
  // Check for this?
  ///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){ 
  int r; 
  MPI_Comm_rank(communicator_world,&r);
  return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  int ierr= MPI_Bcast(data,
		      bytes,
		      MPI_BYTE,
		      root,
		      communicator_world);
  assert(ierr==0);
}



}

@@ -1,988 +0,0 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/communicator/Communicator_mpi.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include "Grid.h"
#include <mpi.h>
//#include <numaif.h>

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Workarounds:
/// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
///    darwin dispatch semaphores don't seem to be multiprocess.
///
/// ii) openmpi under --mca shmem posix works with two squadrons per node; 
///     openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
///     memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
typedef sem_t *Grid_semaphore;


#error  /* This is deprecated */

#if 0 
#define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
#define SEM_POST(S) assert ( sem_post(S) == 0 ); 
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
#else
#define SEM_INIT(S)      ;
#define SEM_INIT_EXCL(S) ;
#define SEM_POST(S) ;
#define SEM_WAIT(S) ;
#endif
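
// The SEM_* macros above (in their non-stubbed form) wrap POSIX *named* semaphores, because, as
// the workaround note explains, unnamed sem_init semaphores are not usable across processes on
// macOS. An illustrative single-process sketch of the calls the macros expand to; the name
// "/Grid_demo_sem" and the function demo_named_semaphore are hypothetical, not original code:
static void demo_named_semaphore(void) {
  const char *name = "/Grid_demo_sem";
  sem_unlink(name);                                          // discard any stale instance (cf. SEM_INIT_EXCL)
  sem_t *owner = sem_open(name, O_CREAT | O_EXCL, 0600, 0);  // create with count 0
  assert(owner != SEM_FAILED);

  sem_t *peer = sem_open(name, 0, 0600, 0);                  // a peer process would open it by name (cf. SEM_INIT)
  assert(peer != SEM_FAILED);

  sem_post(owner);                                           // SEM_POST: wake the waiter...
  sem_wait(peer);                                            // SEM_WAIT: ...which returns immediately here

  sem_close(peer);
  sem_close(owner);
  sem_unlink(name);
}
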
#include <sys/mman.h>

namespace Grid {

enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV };

struct Descriptor {
  uint64_t buf;
  size_t bytes;
  int rank;
  int tag;
  int command;
  uint64_t xbuf;
  uint64_t rbuf;
  int xtag;
  int rtag;
  int src;
  int dest;
  MPI_Request request;
};

const int pool = 48;

class SlaveState {
public:
  volatile int head;
  volatile int start;
  volatile int tail;
  volatile Descriptor Descrs[pool];
};

class Slave {
public:
  Grid_semaphore  sem_head;
  Grid_semaphore  sem_tail;
  SlaveState *state;
  MPI_Comm squadron;
  uint64_t     base;
  int universe_rank;
  int vertical_rank;
  char sem_name [NAME_MAX];
  ////////////////////////////////////////////////////////////
  // Descriptor circular pointers
  ////////////////////////////////////////////////////////////
  Slave() {};

  void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);

  void SemInit(void) {
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
    SEM_INIT(sem_head);
    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
    SEM_INIT(sem_tail);
  }  
  void SemInitExcl(void) {
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
    SEM_INIT_EXCL(sem_head);
    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
    SEM_INIT_EXCL(sem_tail);
  }  
  void WakeUpDMA(void) { 
    SEM_POST(sem_head);
  };
  void WakeUpCompute(void) { 
    SEM_POST(sem_tail);
  };
  void WaitForCommand(void) { 
    SEM_WAIT(sem_head);
  };
  void WaitForComplete(void) { 
    SEM_WAIT(sem_tail);
  };
  void EventLoop (void) {
    //    std::cout<< " Entering event loop "<<std::endl;
    while(1){
      WaitForCommand();
      //      std::cout << "Getting command "<<std::endl;
#if 0
      _mm_monitor((void *)&state->head,0,0);
      int s=state->start;
      if ( s != state->head ) {
	_mm_mwait(0,0);
      }
#endif
      Event();
    }
  }

  int Event (void) ;

  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
  void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ;

  void WaitAll() {
    //    std::cout << "Queueing WAIT command  "<<std::endl;
    QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
    //    std::cout << "Waking up DMA "<<std::endl;
    WakeUpDMA();
    //    std::cout << "Waiting from semaphore "<<std::endl;
    WaitForComplete();
    //    std::cout << "Checking FIFO is empty "<<std::endl;
    while ( state->tail != state->head );
  }
};
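
// Slave/SlaveState above implement a single-producer, single-consumer circular FIFO in shared
// memory: the master advances "head" as it enqueues Descriptors, the slave advances "start" as it
// issues the MPI call and "tail" as work is retired, with indices wrapped modulo the pool size.
// An illustrative single-process sketch of that index protocol, folding start/tail into one
// consumer cursor (DemoFifo and its members are hypothetical, not part of the original code):
struct DemoFifo {
  static const int capacity = 48;      // same size as the Descriptor pool
  volatile int head = 0;               // next free slot, written only by the producer
  volatile int tail = 0;               // oldest live slot, written only by the consumer
  int ring[capacity];

  static int peri_plus(int a) { return (a + 1) % capacity; }   // cf. PERI_PLUS below

  bool push(int v) {                   // producer side, cf. Slave::QueueCommand
    int next = peri_plus(head);
    if (next == tail) return false;    // full: the real code spins here instead of returning
    ring[head] = v;
    head = next;                       // publish only after the payload is written
    return true;
  }
  bool pop(int &v) {                   // consumer side, cf. Slave::Event retiring work
    if (tail == head) return false;    // empty
    v = ring[tail];
    tail = peri_plus(tail);
    return true;
  }
};
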
////////////////////////////////////////////////////////////////////////
// One instance of a data mover.
// Master and Slave must agree on location in shared memory
////////////////////////////////////////////////////////////////////////

class MPIoffloadEngine { 
public:

  static std::vector<Slave> Slaves;

  static int ShmSetup;

  static int UniverseRank;
  static int UniverseSize;

  static MPI_Comm communicator_universe;
  static MPI_Comm communicator_cached;

  static MPI_Comm HorizontalComm;
  static int HorizontalRank;
  static int HorizontalSize;

  static MPI_Comm VerticalComm;
  static MPI_Win  VerticalWindow; 
  static int VerticalSize;
  static int VerticalRank;

  static std::vector<void *> VerticalShmBufs;
  static std::vector<std::vector<int> > UniverseRanks;
  static std::vector<int> UserCommunicatorToWorldRanks; 

  static MPI_Group WorldGroup, CachedGroup;

  static void CommunicatorInit (MPI_Comm &communicator_world,
				MPI_Comm &ShmComm,
				void * &ShmCommBuf);

  static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);

  /////////////////////////////////////////////////////////
  // routines for master proc must handle any communicator
  /////////////////////////////////////////////////////////

  static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    //    std::cout<< " Queueing send  "<< bytes<< " slave "<< slave << " to comm "<<rank  <<std::endl;
    Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
    //    std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
    Slaves[slave].WakeUpDMA();
    //    std::cout << "Waking up DMA "<< slave<<std::endl;
  };

  static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) 
  {
    Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src);
    Slaves[slave].WakeUpDMA();
  }

  static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    //    std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank  <<std::endl;
    Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
    //    std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
    Slaves[slave].WakeUpDMA();
    //    std::cout << "Waking up DMA "<< slave<<std::endl;
  };

  static void WaitAll() {
    for(int s=1;s<VerticalSize;s++) {
      //      std::cout << "Waiting for slave "<< s<<std::endl;
      Slaves[s].WaitAll();
    }
    //    std::cout << " Wait all Complete "<<std::endl;
  };

  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
    int basework = nwork/units;
    int backfill = units-(nwork%units);
    if ( me >= units ) { 
      mywork = myoff = 0;
    } else { 
      mywork = (nwork+me)/units;
      myoff  = basework * me;
      if ( me > backfill ) 
	myoff+= (me-backfill);
    }
    return;
  };

  static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
    uint8_t * cxbuf = (uint8_t *) xbuf;
    uint8_t * crbuf = (uint8_t *) rbuf;
    static int rrp=0;
    int procs = VerticalSize-1;
    int myoff=0;
    int mywork=bytes;
    QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
    rrp = rrp+1;
    if ( rrp == (VerticalSize-1) ) rrp = 0;
  }

  static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
    uint8_t * cxbuf = (uint8_t *) xbuf;
    uint8_t * crbuf = (uint8_t *) rbuf;
    int mywork, myoff, procs;
    procs = VerticalSize-1;
    for(int s=0;s<procs;s++) {
      GetWork(bytes,s,mywork,myoff,procs);
      QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
    }
  };
  static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    uint8_t * cbuf = (uint8_t *) buf;
    int mywork, myoff, procs;
    procs = VerticalSize-1;
    for(int s=0;s<procs;s++) {
      GetWork(bytes,s,mywork,myoff,procs);
      QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
    }
  };

  static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    uint8_t * cbuf = (uint8_t *) buf;
    int mywork, myoff, procs;
    procs = VerticalSize-1;
    for(int s=0;s<procs;s++) {
      GetWork(bytes,s,mywork,myoff,procs);
      QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
    }
  };

};

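// GetWork in MPIoffloadEngine above carves a transfer of nwork bytes into near-equal contiguous
// pieces, one per data-mover slave; the QueueMultiplexed* helpers then hand piece s to slave s+1.
// A small illustrative helper that reproduces the same arithmetic so the split can be inspected
// (demo_get_work_split is hypothetical, not part of the original class). For example, nwork=11
// over units=3 slaves yields offsets {0,3,7} and lengths {3,4,4}, covering all 11 bytes.
static void demo_get_work_split(int nwork, int units, int *myoff, int *mywork) {
  for (int me = 0; me < units; me++) {
    int basework = nwork / units;              // minimum share per slave
    int backfill = units - (nwork % units);    // slaves beyond this index absorb the remainder
    mywork[me] = (nwork + me) / units;
    myoff[me]  = basework * me;
    if (me > backfill) myoff[me] += (me - backfill);
  }
}
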
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////

std::vector<Slave> MPIoffloadEngine::Slaves;

int MPIoffloadEngine::UniverseRank;
int MPIoffloadEngine::UniverseSize;

MPI_Comm  MPIoffloadEngine::communicator_universe;
MPI_Comm  MPIoffloadEngine::communicator_cached;
MPI_Group MPIoffloadEngine::WorldGroup;
MPI_Group MPIoffloadEngine::CachedGroup;

MPI_Comm MPIoffloadEngine::HorizontalComm;
int      MPIoffloadEngine::HorizontalRank;
int      MPIoffloadEngine::HorizontalSize;

MPI_Comm MPIoffloadEngine::VerticalComm;
int      MPIoffloadEngine::VerticalSize;
int      MPIoffloadEngine::VerticalRank;
MPI_Win  MPIoffloadEngine::VerticalWindow; 
std::vector<void *>            MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int>               MPIoffloadEngine::UserCommunicatorToWorldRanks; 

int CartesianCommunicator::NodeCount(void)    { return HorizontalSize;};
int MPIoffloadEngine::ShmSetup = 0;

void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
					 MPI_Comm &ShmComm,
					 void * &ShmCommBuf)
{      
  int flag;
  assert(ShmSetup==0);  

  //////////////////////////////////////////////////////////////////////
  // Universe is all nodes prior to squadron grouping
  //////////////////////////////////////////////////////////////////////
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
  MPI_Comm_rank(communicator_universe,&UniverseRank);
  MPI_Comm_size(communicator_universe,&UniverseSize);

  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory (Verticals)
  /////////////////////////////////////////////////////////////////////
#undef MPI_SHARED_MEM_DEBUG
#ifdef  MPI_SHARED_MEM_DEBUG
  MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
#else 
  MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
#endif
  MPI_Comm_rank(VerticalComm     ,&VerticalRank);
  MPI_Comm_size(VerticalComm     ,&VerticalSize);

  //////////////////////////////////////////////////////////////////////
  // Split into horizontal groups by rank in squadron
  //////////////////////////////////////////////////////////////////////
  MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
  MPI_Comm_rank(HorizontalComm,&HorizontalRank);
  MPI_Comm_size(HorizontalComm,&HorizontalSize);
  assert(HorizontalSize*VerticalSize==UniverseSize);

  ////////////////////////////////////////////////////////////////////////////////
  // What is my place in the world
  ////////////////////////////////////////////////////////////////////////////////
  int WorldRank=0;
  if(VerticalRank==0) WorldRank = HorizontalRank;
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
  assert(ierr==0);

  ////////////////////////////////////////////////////////////////////////////////
  // Where is the world in the universe?
  ////////////////////////////////////////////////////////////////////////////////
  UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
  UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
  for(int w=0;w<HorizontalSize;w++){
    ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
    assert(ierr==0);
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared window for our group, pass back Shm info to CartesianCommunicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  VerticalShmBufs.resize(VerticalSize);

#undef MPI_SHARED_MEM
#ifdef MPI_SHARED_MEM
  ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
  ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
  assert(ierr==0);
  //  std::cout<<"SHM "<<ShmCommBuf<<std::endl;

  for(int r=0;r<VerticalSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
    //    std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
  }
#else 
  char shm_name [NAME_MAX];
  MPI_Barrier(VerticalComm);

  if ( VerticalRank == 0 ) {
    for(int r=0;r<VerticalSize;r++){

      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
      if ( r>0 ) size = sizeof(SlaveState);

      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);

      shm_unlink(shm_name);

      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
      if ( fd < 0 ) {
	perror("failed shm_open");
	assert(0);
      }

      ftruncate(fd, size);

      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if ( VerticalShmBufs[r] == MAP_FAILED ) { 
	perror("failed mmap");
	assert(0);
      }

      /*
      for(uint64_t page=0;page<size;page+=4096){
	void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] );
	int status;
	int flags=MPOL_MF_MOVE_ALL;
	int nodes=1; // numa domain == MCDRAM
	unsigned long count=1;
	ierr= move_pages(0,count, &pages,&nodes,&status,flags);
	if (ierr && (page==0)) perror("numa relocate command failed");
      }
      */
      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
      check[0] = WorldRank;
      check[1] = r;

      //      std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
    }
  }

  MPI_Barrier(VerticalComm);

  if ( VerticalRank != 0 ) { 
  for(int r=0;r<VerticalSize;r++){

    size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
    if ( r>0 ) size = sizeof(SlaveState);

    sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);

    int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
    if ( fd<0 ) {
      perror("failed shm_open");
      assert(0);
    }
    VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    uint64_t * check = (uint64_t *) VerticalShmBufs[r];
    assert(check[0]== WorldRank);
    assert(check[1]== r);
    //    std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
  }
  }
#endif
  MPI_Barrier(VerticalComm);

  //////////////////////////////////////////////////////////////////////
  // Map rank of leader on node in their new world to the
  // rank in this vertical plane's horizontal communicator
  //////////////////////////////////////////////////////////////////////
  communicator_world = HorizontalComm;
  ShmComm            = VerticalComm;
  ShmCommBuf         = VerticalShmBufs[0];
  MPI_Comm_group (communicator_world, &WorldGroup); 

  ///////////////////////////////////////////////////////////
  // Start the slave data movers
  ///////////////////////////////////////////////////////////
  if ( VerticalRank != 0 ) {
    Slave indentured;
    indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
    indentured.SemInitExcl();// init semaphore in shared memory
    MPI_Barrier(VerticalComm);
    MPI_Barrier(VerticalComm);
    indentured.EventLoop();
    assert(0);
  } else {
    Slaves.resize(VerticalSize);
    for(int i=1;i<VerticalSize;i++){
      Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
    }
    MPI_Barrier(VerticalComm);
    for(int i=1;i<VerticalSize;i++){
      Slaves[i].SemInit();// init semaphore in shared memory
    }
    MPI_Barrier(VerticalComm);
  }

  ///////////////////////////////////////////////////////////
  // Verbose for now
  ///////////////////////////////////////////////////////////

  ShmSetup=1;

  if (UniverseRank == 0){

    std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
    std::cout<<UniverseSize   << " Ranks " ;
    std::cout<<HorizontalSize << " Nodes " ;
    std::cout<<VerticalSize   << " with ranks-per-node "<<std::endl;

    std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
    std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;

    for(int g=0;g<HorizontalSize;g++){
      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
    }

    for(int g=0;g<HorizontalSize;g++){
      std::cout<<GridLogMessage<<" { ";
      for(int s=0;s<VerticalSize;s++){
	std::cout<< UniverseRanks[g][s];
	if ( s<VerticalSize-1 ) {
	  std::cout<<",";
	}
      }
      std::cout<<" } "<<std::endl;
    }
  }
};
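
// CommunicatorInit above derives the node-local ("vertical") communicator with
// MPI_Comm_split_type(MPI_COMM_TYPE_SHARED, ...) and then groups equal node-local ranks across
// nodes into the "horizontal" communicator with MPI_Comm_split. An illustrative sketch of that
// two-level split in isolation (demo_two_level_split is hypothetical, not original code):
static void demo_two_level_split(void) {
  int world_rank, world_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  MPI_Comm node_comm;   // ranks that can share memory, i.e. the ranks on my node
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
  int node_rank, node_size;
  MPI_Comm_rank(node_comm, &node_rank);
  MPI_Comm_size(node_comm, &node_size);

  MPI_Comm cross_comm;  // one communicator per node-local rank, spanning the nodes
  MPI_Comm_split(MPI_COMM_WORLD, node_rank, world_rank, &cross_comm);
  int cross_rank, cross_size;
  MPI_Comm_rank(cross_comm, &cross_rank);
  MPI_Comm_size(cross_comm, &cross_size);

  std::cout << "world " << world_rank << "/" << world_size
            << " node-local " << node_rank << "/" << node_size
            << " cross-node " << cross_rank << "/" << cross_size << std::endl;

  MPI_Comm_free(&cross_comm);
  MPI_Comm_free(&node_comm);
}
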
  ///////////////////////////////////////////////////////////////////////////////////////////////
  // Map the communicator into communicator_world, and find the neighbour.
  // Cache the mappings; cache size is 1.
  ///////////////////////////////////////////////////////////////////////////////////////////////
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {

  if ( comm == HorizontalComm ) {
    comm_world_peer = rank;
    //    std::cout << " MapCommRankToWorldRank  horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
  } else if ( comm == communicator_cached ) {
    comm_world_peer = UserCommunicatorToWorldRanks[rank];
    //    std::cout << " MapCommRankToWorldRank  cached " <<rank<<"->"<<comm_world_peer<<std::endl;
  } else { 

    int size;

    MPI_Comm_size(comm,&size);

    UserCommunicatorToWorldRanks.resize(size);

    std::vector<int> cached_ranks(size); 

    for(int r=0;r<size;r++) {
      cached_ranks[r]=r;
    }

    communicator_cached=comm;

    MPI_Comm_group(communicator_cached, &CachedGroup);

    MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]); 

    comm_world_peer = UserCommunicatorToWorldRanks[rank];
    //    std::cout << " MapCommRankToWorldRank  cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;

    assert(comm_world_peer != MPI_UNDEFINED);
  }

  assert( (tag & (~0xFFFFL)) ==0); 

  uint64_t icomm = (uint64_t)comm;
  int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
                ^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);

  //  hashtag = (comm_hash<<15) | tag;      
  hashtag = tag;      

};
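
// MapCommRankToWorldRank above resolves "rank r of some user communicator" to the corresponding
// rank in communicator_world via MPI_Group_translate_ranks, caching the mapping for the last
// communicator seen. A minimal illustrative sketch of that translation for a single rank
// (demo_translate_rank_to_world is hypothetical, not part of the original file):
static int demo_translate_rank_to_world(MPI_Comm user_comm, int user_rank) {
  MPI_Group user_group, world_group;
  MPI_Comm_group(user_comm, &user_group);
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);

  int world_rank;
  // Maps user_rank (numbered within user_comm) onto the numbering of MPI_COMM_WORLD;
  // yields MPI_UNDEFINED if that process is not a member of the target group.
  MPI_Group_translate_ranks(user_group, 1, &user_rank, world_group, &world_rank);

  MPI_Group_free(&user_group);
  MPI_Group_free(&world_group);
  return world_rank;
}
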
void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
 | 
			
		||||
{
 | 
			
		||||
  squadron=_squadron;
 | 
			
		||||
  universe_rank=_universe_rank;
 | 
			
		||||
  vertical_rank=_vertical_rank;
 | 
			
		||||
  state   =_state;
 | 
			
		||||
  //  std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
 | 
			
		||||
  state->head = state->tail = state->start = 0;
 | 
			
		||||
  base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
 | 
			
		||||
  int rank; MPI_Comm_rank(_squadron,&rank);
 | 
			
		||||
}
 | 
			
		||||
#define PERI_PLUS(A) ( (A+1)%pool )
 | 
			
		||||
int Slave::Event (void) {
 | 
			
		||||
 | 
			
		||||
  static int tail_last;
 | 
			
		||||
  static int head_last;
 | 
			
		||||
  static int start_last;
 | 
			
		||||
  int ierr;
 | 
			
		||||
  MPI_Status stat;
 | 
			
		||||
  static int i=0;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  // Try to advance the start pointers
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  int s=state->start;
 | 
			
		||||
  if ( s != state->head ) {
 | 
			
		||||
    switch ( state->Descrs[s].command ) {
 | 
			
		||||
    case COMMAND_ISEND:
 | 
			
		||||
      ierr = MPI_Isend((void *)(state->Descrs[s].buf+base), 
 | 
			
		||||
		       state->Descrs[s].bytes, 
 | 
			
		||||
		       MPI_CHAR,
 | 
			
		||||
		       state->Descrs[s].rank,
 | 
			
		||||
		       state->Descrs[s].tag,
 | 
			
		||||
		       MPIoffloadEngine::communicator_universe,
 | 
			
		||||
		       (MPI_Request *)&state->Descrs[s].request);
 | 
			
		||||
      assert(ierr==0);
 | 
			
		||||
      state->start = PERI_PLUS(s);
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    case COMMAND_IRECV:
 | 
			
		||||
      ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base), 
 | 
			
		||||
		     state->Descrs[s].bytes, 
 | 
			
		||||
		     MPI_CHAR,
 | 
			
		||||
		     state->Descrs[s].rank,
 | 
			
		||||
		     state->Descrs[s].tag,
 | 
			
		||||
		     MPIoffloadEngine::communicator_universe,
 | 
			
		||||
		     (MPI_Request *)&state->Descrs[s].request);
 | 
			
		||||
 | 
			
		||||
      //      std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
 | 
			
		||||
      //      std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
 | 
			
		||||
      assert(ierr==0);
 | 
			
		||||
      state->start = PERI_PLUS(s);
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    case COMMAND_SENDRECV:
 | 
			
		||||
 | 
			
		||||
      //      fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10);
 | 
			
		||||
 | 
			
		||||
      ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10,
 | 
			
		||||
			(void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10,
 | 
			
		||||
			MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE);
 | 
			
		||||
 | 
			
		||||
      assert(ierr==0);
 | 
			
		||||
 | 
			
		||||
      //      fprintf(stderr,"Sendrecv done %d %d\n",ierr,i);
 | 
			
		||||
      //      MPI_Barrier(MPIoffloadEngine::HorizontalComm);
 | 
			
		||||
      //      fprintf(stderr,"Barrier\n");
 | 
			
		||||
      i++;
 | 
			
		||||
 | 
			
		||||
      state->start = PERI_PLUS(s);
 | 
			
		||||
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    case COMMAND_WAITALL:
 | 
			
		||||
 | 
			
		||||
      for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
 | 
			
		||||
	if ( state->Descrs[t].command != COMMAND_SENDRECV ) {
 | 
			
		||||
	  MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
 | 
			
		||||
	}
 | 
			
		||||
      };
 | 
			
		||||
      s=PERI_PLUS(s);
 | 
			
		||||
      state->start = s;
 | 
			
		||||
      state->tail  = s;
 | 
			
		||||
 | 
			
		||||
      WakeUpCompute();
 | 
			
		||||
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    default:
 | 
			
		||||
      assert(0);
 | 
			
		||||
      break;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // External interaction with the queue
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  
 | 
			
		||||
void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) 
 | 
			
		||||
{
 | 
			
		||||
  int head =state->head;
 | 
			
		||||
  int next = PERI_PLUS(head);
 | 
			
		||||
  
 | 
			
		||||
  // Set up descriptor
 | 
			
		||||
  int worldrank;
 | 
			
		||||
  int hashtag;
 | 
			
		||||
  MPI_Comm    communicator;
 | 
			
		||||
  MPI_Request request;
 | 
			
		||||
  uint64_t relative;
 | 
			
		||||
  
 | 
			
		||||
  relative = (uint64_t)xbuf - base;
 | 
			
		||||
  state->Descrs[head].xbuf    = relative;
 | 
			
		||||
  
 | 
			
		||||
  relative= (uint64_t)rbuf - base;
 | 
			
		||||
  state->Descrs[head].rbuf    = relative;
 | 
			
		||||
  
 | 
			
		||||
  state->Descrs[head].bytes  = bytes;
 | 
			
		||||
  
 | 
			
		||||
  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest);
 | 
			
		||||
  state->Descrs[head].dest   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
 | 
			
		||||
  state->Descrs[head].xtag    = hashtag;
 | 
			
		||||
  
 | 
			
		||||
  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src);
 | 
			
		||||
  state->Descrs[head].src    = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
 | 
			
		||||
  state->Descrs[head].rtag    = hashtag;
 | 
			
		||||
  
 | 
			
		||||
  state->Descrs[head].command= COMMAND_SENDRECV;
 | 
			
		||||
  
 | 
			
		||||
  // Block until FIFO has space
 | 
			
		||||
  while( state->tail==next );
 | 
			
		||||
  
 | 
			
		||||
  // Msync on weak order architectures
 | 
			
		||||
  
 | 
			
		||||
  // Advance pointer
 | 
			
		||||
  state->head = next;
 | 
			
		||||
  
 | 
			
		||||
};
 | 
			
		||||
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank) 
 | 
			
		||||
{
 | 
			
		||||
  /////////////////////////////////////////
 | 
			
		||||
  // Spin; if FIFO is full until not full
 | 
			
		||||
  /////////////////////////////////////////
 | 
			
		||||
  int head =state->head;
 | 
			
		||||
  int next = PERI_PLUS(head);
 | 
			
		||||
    
 | 
			
		||||
  // Set up descriptor
 | 
			
		||||
  int worldrank;
 | 
			
		||||
  int hashtag;
 | 
			
		||||
  MPI_Comm    communicator;
 | 
			
		||||
  MPI_Request request;
 | 
			
		||||
  
 | 
			
		||||
  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
 | 
			
		||||
 | 
			
		||||
  uint64_t relative= (uint64_t)buf - base;
 | 
			
		||||
  state->Descrs[head].buf    = relative;
 | 
			
		||||
  state->Descrs[head].bytes  = bytes;
 | 
			
		||||
  state->Descrs[head].rank   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
 | 
			
		||||
  state->Descrs[head].tag    = hashtag;
 | 
			
		||||
  state->Descrs[head].command= command;
 | 
			
		||||
 | 
			
		||||
  /*  
 | 
			
		||||
  if ( command == COMMAND_ISEND ) { 
 | 
			
		||||
  std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank 
 | 
			
		||||
            << " to worldrank " << worldrank <<std::endl;
 | 
			
		||||
  std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
 | 
			
		||||
  std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
 | 
			
		||||
  } 
 | 
			
		||||
  if ( command == COMMAND_IRECV ) { 
 | 
			
		||||
  std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank 
 | 
			
		||||
            << " from worldrank " << worldrank <<std::endl;
 | 
			
		||||
  std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
 | 
			
		||||
  std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
 | 
			
		||||
  } 
 | 
			
		||||
  */
 | 
			
		||||
  // Block until FIFO has space
 | 
			
		||||
  while( state->tail==next );
 | 
			
		||||
 | 
			
		||||
  // Msync on weak order architectures
 | 
			
		||||
  // Advance pointer
 | 
			
		||||
  state->head = next;
 | 
			
		||||
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Info that is setup once and indept of cartesian layout
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
MPI_Comm CartesianCommunicator::communicator_world;
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Init(int *argc, char ***argv) 
 | 
			
		||||
{
 | 
			
		||||
  int flag;
 | 
			
		||||
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
 | 
			
		||||
  if ( !flag ) {
 | 
			
		||||
    MPI_Init(argc,argv);
 | 
			
		||||
  }
 | 
			
		||||
  communicator_world = MPI_COMM_WORLD;
 | 
			
		||||
  MPI_Comm ShmComm;
 | 
			
		||||
  MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 | 
			
		||||
{
 | 
			
		||||
  int rank;
 | 
			
		||||
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
  return rank;
 | 
			
		||||
}
 | 
			
		||||
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 | 
			
		||||
{
 | 
			
		||||
  coor.resize(_ndimension);
 | 
			
		||||
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
			
		||||
{ 
 | 
			
		||||
  _ndimension = processors.size();
 | 
			
		||||
  std::vector<int> periodic(_ndimension,1);
 | 
			
		||||
 | 
			
		||||
  _Nprocessors=1;
 | 
			
		||||
  _processors = processors;
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<_ndimension;i++){
 | 
			
		||||
    _Nprocessors*=_processors[i];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int Size; 
 | 
			
		||||
  MPI_Comm_size(communicator_world,&Size);
 | 
			
		||||
  assert(Size==_Nprocessors);
 | 
			
		||||
 | 
			
		||||
  _processor_coor.resize(_ndimension);
 | 
			
		||||
  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
 | 
			
		||||
  MPI_Comm_rank  (communicator,&_processor);
 | 
			
		||||
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(double &d)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
 | 
			
		||||
					   int dest,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int from,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<CommsRequest_t> reqs(0);
 | 
			
		||||
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
 | 
			
		||||
  SendToRecvFromComplete(reqs);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int sender,
 | 
			
		||||
					   int receiver,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  MPI_Status stat;
 | 
			
		||||
  assert(sender != receiver);
 | 
			
		||||
  int tag = sender;
 | 
			
		||||
  if ( _processor == sender ) {
 | 
			
		||||
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
 | 
			
		||||
  }
 | 
			
		||||
  if ( _processor == receiver ) { 
 | 
			
		||||
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                void *xmit,
                                                int dest,
                                                void *recv,
                                                int from,
                                                int bytes)
{
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor;
  int ierr;
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);

  assert(ierr==0);

  list.push_back(xrq);
  list.push_back(rrq);
}

void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                       void *xmit,
                                                       int dest,
                                                       void *recv,
                                                       int from,
                                                       int bytes)
{
  uint64_t xmit_i = (uint64_t) xmit;
  uint64_t recv_i = (uint64_t) recv;
  uint64_t shm    = (uint64_t) ShmCommBuf;
  // assert xmit and recv lie in shared memory region
  assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
  assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
  assert(from!=_processor);
  assert(dest!=_processor);

  MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);

  //MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
  //MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
  //MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}

void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  MPIoffloadEngine::WaitAll();
  //this->Barrier();
}

void CartesianCommunicator::StencilBarrier(void) { }

void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  int nreq=list.size();
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
}

void CartesianCommunicator::Barrier(void)
{
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  int ierr=MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
  assert(ierr==0);
}

void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  int ierr= MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
  assert(ierr==0);
}

void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }

void *CartesianCommunicator::ShmBuffer(int rank) {
  return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
  return NULL;
}

};
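A minimal standalone sketch of the non-blocking exchange pattern the Begin/Complete pair above wraps; the helper name ExchangeSketch and the bare MPI communicator argument are illustrative assumptions, not Grid code.

// Hypothetical sketch: post both transfers, then wait on the collected requests.
#include <mpi.h>
#include <vector>
#include <cassert>

void ExchangeSketch(void *xmit,int dest,void *recv,int from,int bytes,MPI_Comm comm)
{
  int rank;  MPI_Comm_rank(comm,&rank);
  std::vector<MPI_Request> reqs(2);
  int ierr = MPI_Irecv(recv,bytes,MPI_CHAR,from,from,comm,&reqs[0]); // tag = sender's rank
  ierr    |= MPI_Isend(xmit,bytes,MPI_CHAR,dest,rank,comm,&reqs[1]);
  assert(ierr==0);
  ierr = MPI_Waitall(2,reqs.data(),MPI_STATUSES_IGNORE);             // the "Complete" step
  assert(ierr==0);
}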
@@ -1,273 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
    Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    [GPL v2 license header as in the other Grid sources; see the file "LICENSE" in the top level distribution directory.]
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>

namespace Grid {

///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of the cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;

// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  int provided;
  MPI_Initialized(&flag); // needed to coexist with other libs, apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
    if ( provided != MPI_THREAD_MULTIPLE ) {
      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
    }
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  ShmInitGeneric();
}

CartesianCommunicator::~CartesianCommunicator()
{
  int MPI_is_finalised;
  MPI_Finalized(&MPI_is_finalised);
  if (communicator && !MPI_is_finalised){
    MPI_Comm_free(&communicator);
    for(int i=0;i<(int)communicator_halo.size();i++){
      MPI_Comm_free(&communicator_halo[i]);
    }
  }
}

void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int rank;
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
  assert(ierr==0);
  return rank;
}
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  coor.resize(_ndimension);
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
  assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
                                           int bytes)
{
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
}

void CartesianCommunicator::SendRecvPacket(void *xmit,
                                           void *recv,
                                           int sender,
                                           int receiver,
                                           int bytes)
{
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) {
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                void *xmit,
                                                int dest,
                                                void *recv,
                                                int from,
                                                int bytes)
{
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
    MPI_Request xrq;
    MPI_Request rrq;

    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);

    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else {
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
                      recv,bytes,MPI_CHAR,from, from,
                      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
    int nreq=list.size();
    std::vector<MPI_Status> status(nreq);
    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
    assert(ierr==0);
  }
}
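The two policies above trade latency for overlap: the concurrent branch queues non-blocking requests and completes them later with MPI_Waitall, while the sequential branch blocks in MPI_Sendrecv so the matching Complete has nothing to wait on. A condensed sketch, with the policy enum renamed for the standalone example (an assumption, not Grid code):

#include <mpi.h>
#include <vector>

enum PolicySketch { ConcurrentSketch, SequentialSketch };

void SendRecvSketch(PolicySketch p,std::vector<MPI_Request> &list,
                    void *xmit,int dest,void *recv,int from,int bytes,int rank,MPI_Comm comm)
{
  if ( p == ConcurrentSketch ) {      // queue now, complete later with MPI_Waitall
    MPI_Request xrq,rrq;
    MPI_Irecv(recv,bytes,MPI_CHAR,from,from,comm,&rrq);
    MPI_Isend(xmit,bytes,MPI_CHAR,dest,rank,comm,&xrq);
    list.push_back(xrq);
    list.push_back(rrq);
  } else {                            // block here; the list stays empty, so "Complete" is a no-op
    MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,rank,
                 recv,bytes,MPI_CHAR,from,from,comm,MPI_STATUS_IGNORE);
  }
}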
void CartesianCommunicator::Barrier(void)
{
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  int ierr=MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
  assert(ierr==0);
}
  ///////////////////////////////////////////////////////
  // Should only be used before Grid initialisation has finished.
  // Check for this?
  ///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){
  int r;
  MPI_Comm_rank(communicator_world,&r);
  return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  int ierr= MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
  assert(ierr==0);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                         void *xmit,
                                                         int xmit_to_rank,
                                                         void *recv,
                                                         int recv_from_rank,
                                                         int bytes,int dir)
{
  int myrank = _processor;
  int ierr;
  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;

  //  std::cout << " sending on communicator "<<dir<<" "<<communicator_halo[commdir]<<std::endl;
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  MPI_Request req[2];
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[commdir],&req[0]);

  list.push_back(req[0]);
  list.push_back(req[1]);
  return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
  int nreq=waitall.size();
  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
}
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
                                                    int xmit_to_rank,
                                                    void *recv,
                                                    int recv_from_rank,
                                                    int bytes,int dir)
{
  int myrank = _processor;
  int ierr;
  //  std::cout << " sending on communicator "<<dir<<" "<<communicator_halo.size()<<std::endl;

  int ncomm  =communicator_halo.size();
  int commdir=dir%ncomm;
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  MPI_Request req[2];
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[commdir],&req[0]);
  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
  return 2.0*bytes;
}

}
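The dir%ncomm indexing above multiplexes stencil traffic over a small pool of communicators so that different directions get independent MPI message queues. A minimal sketch of building such a pool, assuming the pool is created with MPI_Comm_dup (the helper name is illustrative, not Grid code):

#include <mpi.h>
#include <vector>

std::vector<MPI_Comm> MakeHaloComms(MPI_Comm base,int ncomm)
{
  std::vector<MPI_Comm> halo(ncomm);
  for(int i=0;i<ncomm;i++) MPI_Comm_dup(base,&halo[i]);   // independent message queues
  return halo;
}

// usage: MPI_Comm comm = halo[dir % halo.size()];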
@@ -1,357 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_shmem.cc
    Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    [GPL v2 license header as in the other Grid sources; see the file "LICENSE" in the top level distribution directory.]
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>
#include <mpp/shmem.h>
#include <array>

namespace Grid {

  // Should error check all SHMEM calls.
#define SHMEM_VET(addr) 

#define SHMEM_VET_DEBUG(addr) {				\
  if ( ! shmem_addr_accessible(addr,_processor) ) {\
    std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
    BACKTRACEFILE();		   \
  }\
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of the cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////

typedef struct HandShake_t {
  uint64_t seq_local;
  uint64_t seq_remote;
} HandShake;

std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
  std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
  ret.fill(SHMEM_SYNC_VALUE);
  return ret;
}
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();

static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;

void CartesianCommunicator::Init(int *argc, char ***argv) {
  shmem_init();
  XConnections.resize(shmem_n_pes());
  RConnections.resize(shmem_n_pes());
  for(int pe =0 ; pe<shmem_n_pes();pe++){
    XConnections[pe].seq_local = 0;
    XConnections[pe].seq_remote= 0;
    RConnections[pe].seq_local = 0;
    RConnections[pe].seq_remote= 0;
  }
  shmem_barrier_all();
  ShmInitGeneric();
}

CartesianCommunicator::~CartesianCommunicator(){}

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
  : CartesianCommunicator(processors)
{
  std::cout << "Attempts to split SHMEM communicators will fail " <<std::endl;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  _ndimension = processors.size();
  std::vector<int> periodic(_ndimension,1);

  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);

  _processor = shmem_my_pe();

  Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);

  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }

  int Size = shmem_n_pes();

  assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
  static long long source ;
  static long long dest   ;
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  //  int nreduce=1;
  //  int pestart=0;
  //  int logStride=0;

  source = u;
  dest   = 0;
  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
  shmem_barrier_all(); // necessary?
  u = dest;
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
  static long long source ;
  static long long dest   ;
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  //  int nreduce=1;
  //  int pestart=0;
  //  int logStride=0;

  source = u;
  dest   = 0;
  shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
  shmem_barrier_all(); // necessary?
  u = dest;
}
void CartesianCommunicator::GlobalSum(float &f){
  static float source ;
  static float dest   ;
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  source = f;
  dest   =0.0;
  shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
  shmem_barrier_all();
  f = dest;
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  static float source ;
  static float dest   = 0 ;
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  if ( shmem_addr_accessible(f,_processor)  ){
    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data());
    shmem_barrier_all();
    return;
  }

  for(int i=0;i<N;i++){
    dest   =0.0;
    source = f[i];
    shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
    shmem_barrier_all();
    f[i] = dest;
  }
}
void CartesianCommunicator::GlobalSum(double &d)
{
  static double source;
  static double dest  ;
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  source = d;
  dest   = 0;
  shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
  shmem_barrier_all();
  d = dest;
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  static double source ;
  static double dest   ;
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  if ( shmem_addr_accessible(d,_processor)  ){
    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data());
    shmem_barrier_all();
    return;
  }

  for(int i=0;i<N;i++){
    source = d[i];
    dest   =0.0;
    shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
    shmem_barrier_all();
    d[i] = dest;
  }
}
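A minimal standalone sketch of the OpenSHMEM reduction pattern the GlobalSum routines above rely on: symmetric (static) source/destination/work/sync arrays plus barriers. The wrapper name is an assumption, and the *_sum_to_all interface is kept only to match the code above.

#include <mpp/shmem.h>

double GlobalSumSketch(double x)
{
  static double src, dst;                                  // symmetric (static) scalars
  static double pWrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];      // symmetric work array
  static long   pSync[_SHMEM_REDUCE_SYNC_SIZE];
  static int    initialised = 0;
  if ( !initialised ) {
    for(int i=0;i<_SHMEM_REDUCE_SYNC_SIZE;i++) pSync[i]=SHMEM_SYNC_VALUE;
    initialised = 1;
    shmem_barrier_all();                                   // everyone sees an initialised pSync
  }
  src = x;
  shmem_double_sum_to_all(&dst,&src,1,0,0,shmem_n_pes(),pWrk,pSync);
  shmem_barrier_all();                                     // safe to reuse src/dst/pSync afterwards
  return dst;
}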
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  std::vector<int> coor = _processor_coor;

  assert(std::abs(shift) <_processors[dim]);

  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
  Lexicographic::IndexFromCoor(coor,source,_processors);

  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
  Lexicographic::IndexFromCoor(coor,dest,_processors);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int rank;
  Lexicographic::IndexFromCoor(coor,rank,_processors);
  return rank;
}
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  Lexicographic::CoorFromIndex(coor,rank,_processors);
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
                                           int bytes)
{
  SHMEM_VET(xmit);
  SHMEM_VET(recv);
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
}

void CartesianCommunicator::SendRecvPacket(void *xmit,
                                           void *recv,
                                           int sender,
                                           int receiver,
                                           int bytes)
{
  static uint64_t seq;

  assert(recv!=xmit);
  volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
  volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];

  if ( _processor == sender ) {

    // Check that the receiver has posted a receive
    while(SendSeq->seq_remote == SendSeq->seq_local);

    // Advance our send count
    seq = ++(SendSeq->seq_local);

    // Send this packet
    SHMEM_VET(recv);
    shmem_putmem(recv,xmit,bytes,receiver);
    shmem_fence();

    // Notify the receiver we're done
    shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
    shmem_fence();
  }
  if ( _processor == receiver ) {

    // Post a receive
    seq = ++(RecvSeq->seq_local);
    shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);

    // Now wait until the sender has advanced our reception counter
    while(RecvSeq->seq_remote != RecvSeq->seq_local);

  }
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                void *xmit,
                                                int dest,
                                                void *recv,
                                                int from,
                                                int bytes)
{
  SHMEM_VET(xmit);
  SHMEM_VET(recv);
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
  shmem_putmem(recv,xmit,bytes,dest);

  if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all();
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  //  shmem_quiet();      // I'm done
  if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all(); // the peer is done too
}
void CartesianCommunicator::Barrier(void)
{
  shmem_barrier_all();
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  static uint32_t word;
  uint32_t *array = (uint32_t *) data;
  assert( (bytes % 4)==0);
  int words = bytes/4;

  if ( shmem_addr_accessible(data,_processor)  ){
    shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data());
    return;
  }

  for(int w=0;w<words;w++){
    word = array[w];
    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
    if ( shmem_my_pe() != root ) {
      array[w] = word;
    }
    shmem_barrier_all();
  }
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  static uint32_t word;
  uint32_t *array = (uint32_t *) data;
  assert( (bytes % 4)==0);
  int words = bytes/4;

  for(int w=0;w<words;w++){
    word = array[w];
    shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
    if ( shmem_my_pe() != root ) {
      array[w]= word;
    }
    shmem_barrier_all();
  }
}

int CartesianCommunicator::RankWorld(void){
  return shmem_my_pe();
}

}
 lib/communicator/SharedMemory.cc (new file, 54 lines)
@@ -0,0 +1,54 @@
/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    [GPL v2 license header as in the other Grid sources; see the file "LICENSE" in the top level distribution directory.]
*************************************************************************************/
/*  END LEGAL */
#include <Grid/GridCore.h>

namespace Grid {

// static data

uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL;
int                 GlobalSharedMemory::Hugepages = 0;
int                 GlobalSharedMemory::ShmSetup;

std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;

Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
int                 GlobalSharedMemory::WorldShmRank;
int                 GlobalSharedMemory::WorldShmSize;
std::vector<int>    GlobalSharedMemory::WorldShmRanks;

Grid_MPI_Comm       GlobalSharedMemory::WorldComm;
int                 GlobalSharedMemory::WorldSize;
int                 GlobalSharedMemory::WorldRank;

int                 GlobalSharedMemory::WorldNodes;
int                 GlobalSharedMemory::WorldNode;

}
 lib/communicator/SharedMemory.h (new file, 158 lines)
@@ -0,0 +1,158 @@
/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    [GPL v2 license header as in the other Grid sources; see the file "LICENSE" in the top level distribution directory.]
*************************************************************************************/
/*  END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
//                             b) shared memory buffers container
//                                -- static, globally shared; init once
//                                -- per instance set of buffers.

#pragma once 

#include <Grid/GridCore.h>

#if defined (GRID_COMMS_MPI3) 
#include <mpi.h>
#endif 
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif

namespace Grid {

#if defined (GRID_COMMS_MPI3) 
  typedef MPI_Comm    Grid_MPI_Comm;
  typedef MPI_Request CommsRequest_t;
#else 
  typedef int CommsRequest_t;
  typedef int Grid_MPI_Comm;
#endif

class GlobalSharedMemory {
 private:
  // Init-once lock on the buffer allocation
  static int ShmSetup;
  static const int     MAXLOG2RANKSPERNODE = 16;

 public:
  static uint64_t      MAX_MPI_SHM_BYTES;
  static int           Hugepages;

  static std::vector<void *> WorldShmCommBufs;

  static Grid_MPI_Comm WorldComm;
  static int           WorldRank;
  static int           WorldSize;

  static Grid_MPI_Comm WorldShmComm;
  static int           WorldShmRank;
  static int           WorldShmSize;

  static int           WorldNodes;
  static int           WorldNode;

  static std::vector<int>  WorldShmRanks;

  //////////////////////////////////////////////////////////////////////////////////////
  // Create an optimal reordered communicator that makes MPI_Cart_create get it right
  //////////////////////////////////////////////////////////////////////////////////////
  static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
  static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);  // Turns MPI_COMM_WORLD into the right layout for Cartesian
  ///////////////////////////////////////////////////
  // Provide shared memory facilities off comm world
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);

};

//////////////////////////////
// one per communicator
//////////////////////////////
class SharedMemory 
{
 private:
  static const int     MAXLOG2RANKSPERNODE = 16;

  size_t heap_top;
  size_t heap_bytes;
  size_t heap_size;

 protected:

  Grid_MPI_Comm    ShmComm; // for barriers
  int    ShmRank;
  int    ShmSize;
  std::vector<void *> ShmCommBufs;
  std::vector<int>    ShmRanks; // Mapping comm ranks to Shm ranks

 public:
  SharedMemory() {};
  ///////////////////////////////////////////////////////////////////////////////////////
  // set the buffers & sizes
  ///////////////////////////////////////////////////////////////////////////////////////
  void SetCommunicator(Grid_MPI_Comm comm);

  ////////////////////////////////////////////////////////////////////////
  // For this instance; disjoint buffer sets between splits if the grid is split
  ////////////////////////////////////////////////////////////////////////
  void ShmBarrier(void);

  ///////////////////////////////////////////////////
  // Call on any instance
  ///////////////////////////////////////////////////
  void SharedMemoryTest(void);
  void *ShmBufferSelf(void);
  void *ShmBuffer    (int rank);
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void  ShmBufferFreeAll(void) ;

  //////////////////////////////////////////////////////////////////////////
  // Make info on nodes, ranks and shared memory available
  //////////////////////////////////////////////////////////////////////////
  int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
  int RankCount(void) { return GlobalSharedMemory::WorldSize;};

};

}
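A hedged usage sketch of the split API declared above, assuming the intended call order is Init, then SharedMemoryAllocate and OptimalCommunicator, then SetCommunicator on each per-communicator instance; the names are taken from the declarations, while the ordering, the flags argument and the include path are assumptions.

// Hypothetical usage sketch of the new split API (not code from this commit):
#include <mpi.h>
#include <vector>
// #include <Grid/communicator/SharedMemory.h>   // include path is an assumption

void SetupSketch(const std::vector<int> &processors)
{
  // One-off, global: node layout and the shared windows.
  Grid::GlobalSharedMemory::Init(MPI_COMM_WORLD);
  Grid::GlobalSharedMemory::SharedMemoryAllocate(Grid::GlobalSharedMemory::MAX_MPI_SHM_BYTES,
                                                 Grid::GlobalSharedMemory::Hugepages);

  // Reorder comm world so Cartesian layouts respect the node blocking.
  Grid::Grid_MPI_Comm optimal;
  Grid::GlobalSharedMemory::OptimalCommunicator(processors,optimal);

  // Per communicator: bind the buffers and check them.
  Grid::SharedMemory shm;
  shm.SetCommunicator(optimal);
  shm.SharedMemoryTest();
  void *mine = shm.ShmBufferSelf();
  (void)mine;
}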
							
								
								
									
 lib/communicator/SharedMemoryMPI.cc (new file, 415 lines)
@@ -0,0 +1,415 @@
/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    [GPL v2 license header as in the other Grid sources; see the file "LICENSE" in the top level distribution directory.]
*************************************************************************************/
/*  END LEGAL */
#include <Grid/GridCore.h>

namespace Grid {

/* Construct from an MPI communicator */
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
  WorldComm = comm;
  MPI_Comm_rank(WorldComm,&WorldRank);
  MPI_Comm_size(WorldComm,&WorldSize);
  // WorldComm, WorldSize, WorldRank

  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
  MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
  MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
  // WorldShmComm, WorldShmSize, WorldShmRank

  // WorldNodes
  WorldNodes = WorldSize/WorldShmSize;
  assert( (WorldNodes * WorldShmSize) == WorldSize );

  // FIXME: Check all WorldShmSize are the same ?

  /////////////////////////////////////////////////////////////////////
  // find world ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group WorldGroup, ShmGroup;
  MPI_Comm_group (WorldComm, &WorldGroup);
  MPI_Comm_group (WorldShmComm, &ShmGroup);

  std::vector<int> world_ranks(WorldSize);   for(int r=0;r<WorldSize;r++) world_ranks[r]=r;

  WorldShmRanks.resize(WorldSize);
  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]);

  ///////////////////////////////////////////////////////////////////
  // Identify who is in my group and nominate the leader
  ///////////////////////////////////////////////////////////////////
  int g=0;
  std::vector<int> MyGroup;
  MyGroup.resize(WorldShmSize);
  for(int rank=0;rank<WorldSize;rank++){
    if(WorldShmRanks[rank]!=MPI_UNDEFINED){
      assert(g<WorldShmSize);
      MyGroup[g++] = rank;
    }
  }

  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
  int myleader = MyGroup[0];

  std::vector<int> leaders_1hot(WorldSize,0);
  std::vector<int> leaders_group(WorldNodes,0);
  leaders_1hot [ myleader ] = 1;

  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
  assert(ierr==0);

  ///////////////////////////////////////////////////////////////////
  // find the group leaders' world ranks
  ///////////////////////////////////////////////////////////////////
  int group=0;
  for(int l=0;l<WorldSize;l++){
    if(leaders_1hot[l]){
      leaders_group[group++] = l;
    }
  }

  ///////////////////////////////////////////////////////////////////
  // Identify the node of the group in which I (and my leader) live
  ///////////////////////////////////////////////////////////////////
  WorldNode=-1;
  for(int g=0;g<WorldNodes;g++){
    if (myleader == leaders_group[g]){
      WorldNode=g;
    }
  }
  assert(WorldNode!=-1);
}
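A minimal standalone sketch of the node-discovery step above: split the world communicator by shared-memory domain, then translate ranks to see which world ranks are node-local (MPI_UNDEFINED means the rank lives on another node). The helper name is an assumption, not Grid code.

#include <mpi.h>
#include <vector>

void NodeDiscoverySketch(MPI_Comm world)
{
  int wsize;
  MPI_Comm_size(world,&wsize);

  MPI_Comm shm;
  MPI_Comm_split_type(world,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&shm);

  MPI_Group wgroup,sgroup;
  MPI_Comm_group(world,&wgroup);
  MPI_Comm_group(shm,&sgroup);

  std::vector<int> wranks(wsize),sranks(wsize);
  for(int r=0;r<wsize;r++) wranks[r]=r;
  MPI_Group_translate_ranks(wgroup,wsize,&wranks[0],sgroup,&sranks[0]);
  // sranks[r] != MPI_UNDEFINED  <=>  world rank r shares this node with us.

  MPI_Group_free(&wgroup); MPI_Group_free(&sgroup); MPI_Comm_free(&shm);
}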
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = -1;
  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
    if ( (0x1<<i) == WorldShmSize ) {
      log2size = i;
      break;
    }
  }
  assert(log2size != -1);

  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
  std::vector<int> processor_coor(ndimension);
  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
  int dim = 0;
  for(int l2=0;l2<log2size;l2++){
    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%ndimension;
  }

  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<ndimension;d++){
    NodeDims[d] = WorldDims[d]/ShmDims[d];
  }

  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  int Nprocessors=1;
  for(int i=0;i<ndimension;i++){
    Nprocessors*=processors[i];
  }
  assert(WorldSize==Nprocessors);

  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  int rank;

  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);

  /////////////////////////////////////////////////////////////////
  // Build the new communicator
  /////////////////////////////////////////////////////////////////
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
}
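A standalone worked example of the sub-block splitting loop above, under the assumption of processors = {4,4,4,4} with 8 ranks per node (log2size = 3); it reproduces ShmDims = {2,2,2,1} and NodeDims = {2,2,2,4}.

// Standalone sketch of the splitting loop (not Grid code); example inputs are assumptions.
#include <vector>
#include <cstdio>

int main(void)
{
  std::vector<int> WorldDims = {4,4,4,4};          // example global decomposition
  int log2size = 3;                                // 8 ranks per node
  std::vector<int> ShmDims(WorldDims.size(),1);
  int dim = 0;
  for(int l2=0;l2<log2size;l2++){
    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%WorldDims.size();
    ShmDims[dim]*=2;
    dim=(dim+1)%WorldDims.size();
  }
  for(size_t d=0;d<WorldDims.size();d++)
    printf("dim %zu : Shm %d Node %d\n",d,ShmDims[d],WorldDims[d]/ShmDims[d]);
  return 0;   // prints Shm = {2,2,2,1}, Node = {2,2,2,4}
}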
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  GlobalSharedMemory::MAX_MPI_SHM_BYTES  = bytes;
  assert(ShmSetup==0); ShmSetup=1;
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);

  ////////////////////////////////////////////////////////////////////////////////////////////
  // Hugetlbfs and friends map filesystems as mappable huge pages
  ////////////////////////////////////////////////////////////////////////////////////////////
  char shm_name [NAME_MAX];
  for(int r=0;r<WorldShmSize;r++){

    sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
    if ( fd == -1) {
      printf("open %s failed\n",shm_name);
      perror("open hugetlbfs");
      exit(0);
    }
    int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
    mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
    if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
    if ( ptr == (void *)MAP_FAILED ) {
      printf("mmap %s failed\n",shm_name);
      perror("failed mmap");      assert(0);
    }
    assert(((uint64_t)ptr&0x3F)==0);
    WorldShmCommBufs[r] =ptr;
  }
};
#endif // MMAP
#ifdef GRID_MPI3_SHMOPEN
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX shm_open; as far as I know Linux does not allow EXPLICIT huge pages in this case.
// tmpfs (Larry Meadows says) does not support explicit huge pages, and it backs the
// POSIX shm virtual file system.
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  GlobalSharedMemory::MAX_MPI_SHM_BYTES  = bytes;
  assert(ShmSetup==0); ShmSetup=1;
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);

  char shm_name [NAME_MAX];
  if ( WorldShmRank == 0 ) {
    for(int r=0;r<WorldShmSize;r++){

      size_t size = bytes;

      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);

      shm_unlink(shm_name);
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
      ftruncate(fd, size);

      int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
      mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
      if (flags) mmap_flag |= MAP_HUGETLB;
#endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);

      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);

      WorldShmCommBufs[r] =ptr;
    }
  }

  MPI_Barrier(WorldShmComm);

  if ( WorldShmRank != 0 ) {
    for(int r=0;r<WorldShmSize;r++){

      size_t size = bytes ;

      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);

      int fd=shm_open(shm_name,O_RDWR,0666);
      if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }

      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);
      WorldShmCommBufs[r] =ptr;
    }
  }
}
#endif
void GlobalSharedMemory::SharedMemoryFree(void)
 | 
			
		||||
{
 | 
			
		||||
  assert(ShmSetup);
 | 
			
		||||
  assert(0); // unimplemented
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  // Global shared functionality finished
 | 
			
		||||
  // Now move to per communicator functionality
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 | 
			
		||||
{
 | 
			
		||||
  int rank, size;
 | 
			
		||||
  MPI_Comm_rank(comm,&rank);
 | 
			
		||||
  MPI_Comm_size(comm,&size);
 | 
			
		||||
  ShmRanks.resize(size);
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Split into groups that can share memory
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////
 | 
			
		||||
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
 | 
			
		||||
  MPI_Comm_rank(ShmComm     ,&ShmRank);
 | 
			
		||||
  MPI_Comm_size(ShmComm     ,&ShmSize);
 | 
			
		||||
  ShmCommBufs.resize(ShmSize);
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Map ShmRank to WorldShmRank and use the right buffer
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  heap_size = GlobalSharedMemory::MAX_MPI_SHM_BYTES;
  for(int r=0;r<ShmSize;r++){

    uint32_t sr = (r==ShmRank) ? GlobalSharedMemory::WorldRank : 0 ;

    MPI_Allreduce(MPI_IN_PLACE,&sr,1,MPI_UINT32_T,MPI_SUM,comm);

    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[sr];
  }
  ShmBufferFreeAll();

  /////////////////////////////////////////////////////////////////////
  // find comm ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group FullGroup, ShmGroup;
  MPI_Comm_group (comm   , &FullGroup);
  MPI_Comm_group (ShmComm, &ShmGroup);

  std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
  MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
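  // After the translation, ShmRanks[r] holds the node-local rank of rank r of
  // comm, or MPI_UNDEFINED when rank r lives on a different node; ShmBuffer()
  // and ShmBufferTranslate() use this to decide whether a peer is reachable
  // through shared memory.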
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void)
{
  MPI_Barrier  (ShmComm);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
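// Rank 0 of the node writes a (node, rank, sentinel) triple into every node-local
// buffer; after a barrier each rank re-reads the triples through its own mapping,
// which fails loudly if the mappings do not refer to the same physical memory.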
void SharedMemory::SharedMemoryTest(void)
{
  ShmBarrier();
  if ( ShmRank == 0 ) {
    for(int r=0;r<ShmSize;r++){
      uint64_t * check = (uint64_t *) ShmCommBufs[r];
      check[0] = GlobalSharedMemory::WorldNode;
      check[1] = r;
      check[2] = 0x5A5A5A;
    }
  }
  ShmBarrier();
  for(int r=0;r<ShmSize;r++){
    uint64_t * check = (uint64_t *) ShmCommBufs[r];

    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==0x5A5A5A);

  }
  ShmBarrier();
}

void *SharedMemory::ShmBufferSelf(void)
{
  return ShmCommBufs[ShmRank];
}
void *SharedMemory::ShmBuffer(int rank)
{
  int gpeer = ShmRanks[rank];
  if (gpeer == MPI_UNDEFINED){
    return NULL;
  } else {
    return ShmCommBufs[gpeer];
  }
}
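// ShmBufferTranslate: given a pointer into this rank's shared buffer, return the
// address of the same bytes as seen through the peer rank's buffer. Every rank
// maps all node-local buffers (possibly at different virtual addresses), so the
// byte offset within our own buffer is also valid inside the peer's buffer.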
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
  static int count =0;
  int gpeer = ShmRanks[rank];
  assert(gpeer!=ShmRank); // never send to self
  if (gpeer == MPI_UNDEFINED){
    return NULL;
  } else {
    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
    return (void *) remote;
  }
}

/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
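// ShmBufferMalloc is a simple bump allocator over the pre-mapped shared segment:
// it hands out consecutive slices by advancing heap_top, and ShmBufferFreeAll()
// resets the whole heap in one go; individual allocations are never freed.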
void *SharedMemory::ShmBufferMalloc(size_t bytes){
  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
  void *ptr = (void *)heap_top;
  heap_top  += bytes;
  heap_bytes+= bytes;
  if (heap_bytes >= heap_size) {
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
    assert(heap_bytes<heap_size);
  }
  return ptr;
}
void SharedMemory::ShmBufferFreeAll(void) {
  heap_top  =(size_t)ShmBufferSelf();
  heap_bytes=0;
}

}
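The creator/attacher split used in SharedMemoryAllocate above is the standard POSIX shared-memory pattern. The following is a minimal standalone sketch, not part of the Grid sources (the segment name and size are invented for illustration); it runs the same shm_open / ftruncate / mmap sequence inside one process, with the first mapping playing rank 0's creation role and the second playing a later attach, so both see the same bytes. Older glibc may require linking with -lrt.

// shm_sketch.cc -- illustrative only, not part of Grid.
#include <cassert>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
  const char  *name = "/Grid_shm_sketch";   // hypothetical segment name
  const size_t size = 1 << 20;              // 1 MiB

  // Creator: make the segment, size it, map it (rank 0's role above).
  shm_unlink(name);                          // drop any stale segment; ignore errors
  int fd = shm_open(name, O_RDWR | O_CREAT, 0666);
  assert(fd >= 0);
  assert(ftruncate(fd, size) == 0);
  void *creator = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  assert(creator != MAP_FAILED);
  strcpy((char *)creator, "hello via POSIX shm");

  // Attacher: open the same name without O_CREAT and map it
  // (what the non-zero ranks do after the barrier).
  int fd2 = shm_open(name, O_RDWR, 0666);
  assert(fd2 >= 0);
  void *attacher = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd2, 0);
  assert(attacher != MAP_FAILED);
  printf("attacher reads: %s\n", (char *)attacher);  // same physical pages

  munmap(creator, size); munmap(attacher, size);
  close(fd); close(fd2);
  shm_unlink(name);
  return 0;
}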
150	lib/communicator/SharedMemoryNone.cc	Normal file
@@ -0,0 +1,150 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/communicator/SharedMemory.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#include <Grid/GridCore.h>

namespace Grid { 

/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
  WorldComm = 0;
  WorldRank = 0;
  WorldSize = 1;
  WorldShmComm = 0 ;
  WorldShmRank = 0 ;
  WorldShmSize = 1 ;
  WorldNodes   = 1 ;
  WorldNode    = 0 ;
  WorldShmRanks.resize(WorldSize); WorldShmRanks[0] = 0;
  WorldShmCommBufs.resize(1);
}

void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
  optimal_comm = WorldComm;
}

////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  void * ShmCommBuf ; 
  MAX_MPI_SHM_BYTES=bytes;
  int mmap_flag =0;
#ifdef MAP_ANONYMOUS
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
  if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
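  // No MPI and a single "node": back the communication buffer with an anonymous
  // mapping (fd = -1) instead of a named POSIX shm object. Anonymous mappings are
  // zero-filled by the kernel, so the bzero below is belt-and-braces.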
  ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
  if (ShmCommBuf == (void *)MAP_FAILED) {
    perror("mmap failed ");
    exit(EXIT_FAILURE);
  }
#ifdef MADV_HUGEPAGE
  if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
#endif
  bzero(ShmCommBuf,bytes);
  WorldShmCommBufs[0] = ShmCommBuf;
};

void GlobalSharedMemory::SharedMemoryFree(void)
{
  assert(ShmSetup);
  assert(0); // unimplemented
}

  ////////////////////////////////////////////////////////
  // Global shared functionality finished
  // Now move to per communicator functionality
  ////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
  ShmRanks.resize(1);
  ShmCommBufs.resize(1);
  ShmRanks[0] = 0;
  ShmRank     = 0;
  ShmSize     = 1;
  //////////////////////////////////////////////////////////////////////
  // Map ShmRank to WorldShmRank and use the right buffer
  //////////////////////////////////////////////////////////////////////
  ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
  heap_size      = GlobalSharedMemory::MAX_MPI_SHM_BYTES;
  ShmBufferFreeAll();
  return;
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void){ return ; }

//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void) { return; }

void *SharedMemory::ShmBufferSelf(void)
{
  return ShmCommBufs[ShmRank];
}
void *SharedMemory::ShmBuffer(int rank)
{
  return NULL;
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
  return NULL;
}

/////////////////////////////////
// Alloc, free shmem region ; common to MPI and none?
/////////////////////////////////
void *SharedMemory::ShmBufferMalloc(size_t bytes){
  void *ptr = (void *)heap_top;
  heap_top  += bytes;
  heap_bytes+= bytes;
  if (heap_bytes >= heap_size) {
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
    assert(heap_bytes<heap_size);
  }
  return ptr;
}
void SharedMemory::ShmBufferFreeAll(void) {
  heap_top  =(size_t)ShmBufferSelf();
  heap_bytes=0;
}

}
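For reference, the heap discipline that ShmBufferMalloc and ShmBufferFreeAll implement in both back ends can be reproduced in isolation. The sketch below is illustrative only (BumpHeap and its members are invented names, not Grid APIs): a bump allocator that slices a pre-allocated region by advancing a top pointer and can only be reset wholesale.

// bump_sketch.cc -- illustrative only; BumpHeap is not a Grid class.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

class BumpHeap {
public:
  explicit BumpHeap(size_t bytes) : region(bytes), top(0) {}

  // Hand out the next 'bytes' of the region; there is no per-allocation free.
  void *Malloc(size_t bytes) {
    assert(top + bytes <= region.size() && "heap exhausted -- enlarge the region");
    void *ptr = region.data() + top;
    top += bytes;
    return ptr;
  }

  // Reset the whole heap in one go, like ShmBufferFreeAll().
  void FreeAll(void) { top = 0; }

private:
  std::vector<uint8_t> region; // stands in for the mmap'd shared segment
  size_t top;                  // offset of the next free byte
};

int main(void) {
  BumpHeap heap(1 << 20);                 // 1 MiB region
  double *a = (double *)heap.Malloc(128 * sizeof(double));
  double *b = (double *)heap.Malloc(256 * sizeof(double));
  a[0] = 1.0; b[0] = 2.0;
  printf("a=%p b=%p\n", (void *)a, (void *)b);
  heap.FreeAll();                         // both slices are invalidated together
  return 0;
}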