diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 6a283085..a32d7535 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -208,7 +208,6 @@ int main (int argc, char ** argv)
   std::cout< 1.0e-6 ) {
-      std::cout << "site "<
diff --git a/lib/Log.cc b/lib/Log.cc
@@ ... @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#ifdef GRID_COMMS_MPI
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
diff --git a/lib/Threads.h b/lib/Threads.h
index d502dd82..70350685 100644
--- a/lib/Threads.h
+++ b/lib/Threads.h
@@ -31,6 +31,7 @@ Author: paboyle
 
 #ifdef _OPENMP
 #define GRID_OMP
+#warning "OpenMP"
 #endif
 
 #define UNROLL _Pragma("unroll")
@@ -127,6 +128,22 @@ class GridThread {
     ThreadBarrier();
   };
 
+  static void bcopy(const void *src, void *dst, size_t len) {
+#ifdef GRID_OMP
+#pragma omp parallel
+    {
+      const char *c_src =(char *) src;
+      char *c_dest=(char *) dst;
+      int me,mywork,myoff;
+      GridThread::GetWorkBarrier(len,me, mywork,myoff);
+      bcopy(&c_src[myoff],&c_dest[myoff],mywork);
+    }
+#else
+    bcopy(src,dst,len);
+#endif
+  }
+
+
 };
 
 }
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 9b5ae8cb..6d6602be 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -45,7 +45,7 @@ class CartesianCommunicator {
  public:
 
   // Communicator should know nothing of the physics grid, only processor grid.
-
+
   int              _Nprocessors;     // How many in all
   std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
   int              _processor;       // linear processor rank
@@ -56,10 +56,13 @@ class CartesianCommunicator {
     MPI_Comm communicator;
     typedef MPI_Request CommsRequest_t;
 #elif GRID_COMMS_MPI3
+    int shm_mode;
+
     MPI_Comm communicator;
     typedef MPI_Request CommsRequest_t;
-    const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now
 
+    const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now
+    const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enought for 48^4 local vol comms
     std::vector<int> WorldDims;
     std::vector<int> GroupDims;
@@ -69,14 +72,23 @@ class CartesianCommunicator {
     std::vector<int> ShmCoor;
     std::vector<int> WorldCoor;
 
-    int GroupRank;
-    int ShmRank;
-    int WorldRank;
+    static std::vector<int> GroupRanks;
+    static std::vector<int> MyGroup;
+    static int ShmSetup;
+    static MPI_Win ShmWindow;
+    static MPI_Comm ShmComm;
 
-    int GroupSize;
-    int ShmSize;
+    void * ShmCommBuf;
+    std::vector<void *> ShmCommBufs;
+
+    int WorldRank;
     int WorldSize;
 
+    static int ShmRank;
+    static int ShmSize;
+    static int GroupSize;
+    static int GroupRank;
+
     std::vector<int> LexicographicToWorldRank;
 #else
     typedef int CommsRequest_t;
diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc
index dff9811a..9c66202f 100644
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -39,11 +39,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
   }
 }
 
-  int Rank(void) {
-    int pe;
-    MPI_Comm_rank(MPI_COMM_WORLD,&pe);
-    return pe;
-  }
+int Rank(void) {
+  int pe;
+  MPI_Comm_rank(MPI_COMM_WORLD,&pe);
+  return pe;
+}
 
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
@@ -168,7 +168,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
   int nreq=list.size();
   std::vector<MPI_Status> status(nreq);
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
 }
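The GridThread::bcopy added in lib/Threads.h above splits one flat copy across the OpenMP team, each thread taking its own offset and work size from GetWorkBarrier (the unqualified inner bcopy presumably intends the C library ::bcopy rather than a recursive call). A minimal standalone sketch of the same pattern, using plain OpenMP and memcpy instead of Grid's helpers; the chunking arithmetic here is an assumption, not taken from GetWorkBarrier:

    #include <cstring>   // memcpy
    #include <omp.h>     // compile with -fopenmp

    // Split a byte copy over the OpenMP team: each thread copies only its
    // own [off, off+mywork) slice of the buffer.
    static void threaded_copy(const void *src, void *dst, size_t len) {
    #pragma omp parallel
      {
        int    nthr  = omp_get_num_threads();
        int    me    = omp_get_thread_num();
        size_t chunk = (len + nthr - 1) / nthr;          // ceiling division
        size_t off   = (size_t)me * chunk;
        if (off < len) {
          size_t mywork = (off + chunk <= len) ? chunk : (len - off);
          memcpy((char *)dst + off, (const char *)src + off, mywork);
        }
      }
    }
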
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 11734fd9..1a36e610 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -30,6 +30,8 @@ Author: Peter Boyle
 
 namespace Grid {
 
+
+
 // Global used by Init and nowhere else. How to hide?
 int Rank(void) {
   int pe;
@@ -76,29 +78,129 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
   rank = LexicographicToWorldRank[rank];
 }
 
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Info that is setup once and indept of cartesian layout
+///////////////////////////////////////////////////////////////////////////////////////////////////
+int CartesianCommunicator::ShmSetup = 0;
+int CartesianCommunicator::ShmRank;
+int CartesianCommunicator::ShmSize;
+int CartesianCommunicator::GroupRank;
+int CartesianCommunicator::GroupSize;
+MPI_Comm CartesianCommunicator::ShmComm;
+MPI_Win  CartesianCommunicator::ShmWindow;
+std::vector<int> CartesianCommunicator::GroupRanks;
+std::vector<int> CartesianCommunicator::MyGroup;
+
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
+  _ndimension = processors.size();
 
-  std::cout << "Creating "<< _ndimension << " dim communicator "<
+  std::vector<int> world_ranks(WorldSize);
+  GroupRanks.resize(WorldSize);
+  MyGroup.resize(ShmSize);
+  for(int r=0;r<
+  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
+  int myleader = MyGroup[0];
+
+  std::vector<int> leaders_1hot(WorldSize,0);
+  std::vector<int> leaders_group(GroupSize,0);
+  leaders_1hot [ myleader ] = 1;
+
+  ///////////////////////////////////////////////////////////////////
+  // global sum leaders over comm world
+  ///////////////////////////////////////////////////////////////////
+  ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
+  assert(ierr==0);
+
+  ///////////////////////////////////////////////////////////////////
+  // find the group leaders world rank
+  ///////////////////////////////////////////////////////////////////
+  int group=0;
+  for(int l=0;l<
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   ShmDims.resize(_ndimension,1);
   GroupDims.resize(_ndimension);
-
+
   ShmCoor.resize(_ndimension);
   GroupCoor.resize(_ndimension);
   WorldCoor.resize(_ndimension);
@@ -129,12 +231,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
     ShmDims[dim]*=2;
     dim=(dim+1)%_ndimension;
   }
-
-  std::cout << "Shm group dims "<
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   for(int d=0;d<_ndimension;d++){
     GroupDims[d] = WorldDims[d]/ShmDims[d];
   }
-  std::cout << "Group dims "<
-  std::vector<int> world_ranks(WorldSize);
-  std::vector<int> group_ranks(WorldSize);
-  std::vector<int> mygroup(GroupSize);
-  for(int r=0;r<
@@ ... @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   _processors = processors;
   _processor_coor.resize(_ndimension);
   for(int i=0;i<_ndimension;i++){
-    std::cout << " p " << _processors[i]<
-  std::sort(mygroup.begin(),mygroup.end(),std::less<int>());
-  int myleader = mygroup[0];
-
-  std::vector<int> leaders_1hot(WorldSize,0);
-  std::vector<int> leaders_group(GroupSize,0);
-  leaders_1hot [ myleader ] = 1;
-
-  ///////////////////////////////////////////////////////////////////
-  // global sum leaders over comm world
-  ///////////////////////////////////////////////////////////////////
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
-  assert(ierr==0);
-
-  ///////////////////////////////////////////////////////////////////
-  // find the group leaders world rank
-  ///////////////////////////////////////////////////////////////////
-  int group=0;
-  for(int l=0;l<
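The constructor rework above first splits the world communicator into per-node shared-memory groups and then locates every node leader by summing one-hot vectors over the world communicator. A hedged sketch of that discovery step using standard MPI-3 calls; variable names are illustrative, and the leader here is simply shm rank 0 rather than the smallest world rank that the diff picks out of MyGroup:

    #include <mpi.h>
    #include <vector>
    #include <cassert>

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);

      int world_rank, world_size;
      MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
      MPI_Comm_size(MPI_COMM_WORLD, &world_size);

      // Ranks that can share memory (same node) land in the same shmcomm.
      MPI_Comm shmcomm;
      MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
      int shm_rank, shm_size;
      MPI_Comm_rank(shmcomm, &shm_rank);
      MPI_Comm_size(shmcomm, &shm_size);

      // Each node leader marks its world rank; the global sum then tells
      // every rank where all the leaders live.
      std::vector<int> leaders_1hot(world_size, 0);
      if (shm_rank == 0) leaders_1hot[world_rank] = 1;
      int ierr = MPI_Allreduce(MPI_IN_PLACE, &leaders_1hot[0], world_size,
                               MPI_INT, MPI_SUM, MPI_COMM_WORLD);
      assert(ierr == 0);

      // Scanning in world-rank order assigns each node a contiguous group index.
      int my_group = 0, group = 0;
      for (int l = 0; l < world_size; l++) {
        if (leaders_1hot[l]) {
          if (shm_rank == 0 && l == world_rank) my_group = group;
          group++;
        }
      }
      MPI_Bcast(&my_group, 1, MPI_INT, 0, shmcomm);  // share the group id on-node

      MPI_Comm_free(&shmcomm);
      MPI_Finalize();
      return 0;
    }
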
@@ ... @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
   MPI_Request xrq;
   MPI_Request rrq;
+  int rank = _processor;
   int ierr;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+  int tag;
+  int small = (bytes<
@@ ... @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
   int nreq=list.size();
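The new ShmWindow/ShmCommBufs members, together with the small-message test that the SendToRecvFromBegin hunk introduces, point at routing intra-node transfers through an MPI-3 shared-memory window instead of Isend/Irecv pairs. A sketch of how such a window is typically allocated and how every rank on a node can obtain direct pointers to its neighbours' segments; the 1MB size and all names here are illustrative, not taken from the diff:

    #include <mpi.h>
    #include <vector>
    #include <cstring>

    // Allocate one shared segment per rank on the node and collect direct
    // pointers to every rank's segment, so a "send" to a neighbour on the
    // same node can become a plain memcpy into its buffer.
    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);

      MPI_Comm shmcomm;
      MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
      int shm_rank, shm_size;
      MPI_Comm_rank(shmcomm, &shm_rank);
      MPI_Comm_size(shmcomm, &shm_size);

      const MPI_Aint bytes = 1 << 20;  // 1MB per rank, illustrative only
      void *my_base = nullptr;
      MPI_Win shm_win;
      MPI_Win_allocate_shared(bytes, 1, MPI_INFO_NULL, shmcomm, &my_base, &shm_win);

      // Query the base address of every rank's segment in the same window.
      std::vector<void *> bufs(shm_size);
      for (int r = 0; r < shm_size; r++) {
        MPI_Aint sz; int disp_unit;
        MPI_Win_shared_query(shm_win, r, &sz, &disp_unit, &bufs[r]);
      }

      // Example: rank 0 writes into rank 1's segment, then everyone synchronises.
      MPI_Win_fence(0, shm_win);
      if (shm_rank == 0 && shm_size > 1) {
        const char msg[] = "hello neighbour";
        memcpy(bufs[1], msg, sizeof(msg));
      }
      MPI_Win_fence(0, shm_win);

      MPI_Win_free(&shm_win);
      MPI_Comm_free(&shmcomm);
      MPI_Finalize();
      return 0;
    }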