From a762b1fb712fa620e5662e4e7d41a94925ecff00 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 21 Oct 2016 09:03:26 +0100 Subject: [PATCH 01/27] MPI3 working with a bounce through shared memory on my laptop. Longer term plan: make the "u_comm_buf" in Stencil point to the shared region and avoid the send between ranks on same node. --- benchmarks/Benchmark_dwf.cc | 7 +- lib/AlignedAllocator.h | 9 - lib/Init.cc | 2 +- lib/Log.cc | 2 +- lib/Threads.h | 17 ++ lib/communicator/Communicator_base.h | 26 ++- lib/communicator/Communicator_mpi.cc | 11 +- lib/communicator/Communicator_mpi3.cc | 250 +++++++++++++++++--------- 8 files changed, 208 insertions(+), 116 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 6a283085..a32d7535 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -208,7 +208,6 @@ int main (int argc, char ** argv) std::cout< 1.0e-6 ) { - std::cout << "site "< &logstreams) { //////////////////////////////////////////////////////////// void Grid_quiesce_nodes(void) { int me = 0; -#ifdef GRID_COMMS_MPI +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) MPI_Comm_rank(MPI_COMM_WORLD, &me); #endif #ifdef GRID_COMMS_SHMEM diff --git a/lib/Threads.h b/lib/Threads.h index d502dd82..70350685 100644 --- a/lib/Threads.h +++ b/lib/Threads.h @@ -31,6 +31,7 @@ Author: paboyle #ifdef _OPENMP #define GRID_OMP +#warning "OpenMP" #endif #define UNROLL _Pragma("unroll") @@ -127,6 +128,22 @@ class GridThread { ThreadBarrier(); }; + static void bcopy(const void *src, void *dst, size_t len) { +#ifdef GRID_OMP +#pragma omp parallel + { + const char *c_src =(char *) src; + char *c_dest=(char *) dst; + int me,mywork,myoff; + GridThread::GetWorkBarrier(len,me, mywork,myoff); + bcopy(&c_src[myoff],&c_dest[myoff],mywork); + } +#else + bcopy(src,dst,len); +#endif + } + + }; } diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 9b5ae8cb..6d6602be 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -45,7 +45,7 @@ class CartesianCommunicator { public: // Communicator should know nothing of the physics grid, only processor grid. - + int _Nprocessors; // How many in all std::vector _processors; // Which dimensions get relayed out over processors lanes. 
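// A minimal sketch (not part of the patch) of the work-splitting idiom behind
// GridThread::bcopy added in Threads.h above: each OpenMP thread computes its
// contiguous byte range and copies only that slice. The range arithmetic is an
// illustrative stand-in for GridThread::GetWorkBarrier; argument order follows
// BSD bcopy(src,dst,len).
#ifdef _OPENMP
#include <omp.h>
#endif
#include <cstring>
#include <algorithm>
static inline void threaded_bcopy_sketch(const void *src, void *dst, size_t len)
{
#ifdef _OPENMP
#pragma omp parallel
  {
    size_t nthr  = (size_t)omp_get_num_threads();
    size_t me    = (size_t)omp_get_thread_num();
    size_t chunk = (len + nthr - 1) / nthr;          // ceil(len/nthr)
    size_t off   = std::min(me * chunk, len);        // my starting byte
    size_t work  = std::min(chunk, len - off);       // my byte count
    memcpy((char *)dst + off, (const char *)src + off, work);
  }
#else
  memcpy(dst, src, len);
#endif
}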
int _processor; // linear processor rank @@ -56,10 +56,13 @@ class CartesianCommunicator { MPI_Comm communicator; typedef MPI_Request CommsRequest_t; #elif GRID_COMMS_MPI3 + int shm_mode; + MPI_Comm communicator; typedef MPI_Request CommsRequest_t; - const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now + const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now + const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enought for 48^4 local vol comms std::vector WorldDims; std::vector GroupDims; @@ -69,14 +72,23 @@ class CartesianCommunicator { std::vector ShmCoor; std::vector WorldCoor; - int GroupRank; - int ShmRank; - int WorldRank; + static std::vector GroupRanks; + static std::vector MyGroup; + static int ShmSetup; + static MPI_Win ShmWindow; + static MPI_Comm ShmComm; - int GroupSize; - int ShmSize; + void * ShmCommBuf; + std::vector ShmCommBufs; + + int WorldRank; int WorldSize; + static int ShmRank; + static int ShmSize; + static int GroupSize; + static int GroupRank; + std::vector LexicographicToWorldRank; #else typedef int CommsRequest_t; diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index dff9811a..9c66202f 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -39,11 +39,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { } } - int Rank(void) { - int pe; - MPI_Comm_rank(MPI_COMM_WORLD,&pe); - return pe; - } +int Rank(void) { + int pe; + MPI_Comm_rank(MPI_COMM_WORLD,&pe); + return pe; +} CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { @@ -168,7 +168,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & int nreq=list.size(); std::vector status(nreq); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); - assert(ierr==0); } diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 11734fd9..1a36e610 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -30,6 +30,8 @@ Author: Peter Boyle namespace Grid { + + // Global used by Init and nowhere else. How to hide? 
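// A minimal sketch (illustrative, not Grid code) of the MPI-3 pattern the
// constructor below relies on: split the world communicator into one
// communicator per shared-memory node, allocate a window of MAX_MPI_SHM_BYTES
// inside it, and query every on-node rank's base pointer so a peer's slot can
// be addressed directly with plain loads and stores.
#include <mpi.h>
#include <cassert>
static void shm_window_sketch(void)
{
  MPI_Comm shm_comm;
  int ierr = MPI_Comm_split_type(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,
                                 MPI_INFO_NULL,&shm_comm);
  assert(ierr==0);

  void    *base = NULL;
  MPI_Win  win;
  MPI_Aint bytes = 256*1024*1024;   // cf. MAX_MPI_SHM_BYTES above
  ierr = MPI_Win_allocate_shared(bytes,1,MPI_INFO_NULL,shm_comm,&base,&win);
  assert(ierr==0);

  int shm_size; MPI_Comm_size(shm_comm,&shm_size);
  for(int r=0;r<shm_size;r++){      // locate each on-node rank's segment
    MPI_Aint sz; int disp_unit; void *peer;
    ierr = MPI_Win_shared_query(win,r,&sz,&disp_unit,&peer);
    assert(ierr==0);
  }
}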
int Rank(void) { int pe; @@ -76,29 +78,129 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &c rank = LexicographicToWorldRank[rank]; } +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmSetup = 0; +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +MPI_Comm CartesianCommunicator::ShmComm; +MPI_Win CartesianCommunicator::ShmWindow; +std::vector CartesianCommunicator::GroupRanks; +std::vector CartesianCommunicator::MyGroup; + CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { + _ndimension = processors.size(); - std::cout << "Creating "<< _ndimension << " dim communicator "< world_ranks(WorldSize); + GroupRanks.resize(WorldSize); + MyGroup.resize(ShmSize); + for(int r=0;r()); + int myleader = MyGroup[0]; + + std::vector leaders_1hot(WorldSize,0); + std::vector leaders_group(GroupSize,0); + leaders_1hot [ myleader ] = 1; + + /////////////////////////////////////////////////////////////////// + // global sum leaders over comm world + /////////////////////////////////////////////////////////////////// + ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator); + assert(ierr==0); + + /////////////////////////////////////////////////////////////////// + // find the group leaders world rank + /////////////////////////////////////////////////////////////////// + int group=0; + for(int l=0;l &processors) ShmDims.resize(_ndimension,1); GroupDims.resize(_ndimension); - + ShmCoor.resize(_ndimension); GroupCoor.resize(_ndimension); WorldCoor.resize(_ndimension); @@ -129,12 +231,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ShmDims[dim]*=2; dim=(dim+1)%_ndimension; } - - std::cout << "Shm group dims "< &processors) for(int d=0;d<_ndimension;d++){ GroupDims[d] = WorldDims[d]/ShmDims[d]; } - std::cout << "Group dims "< world_ranks(WorldSize); - std::vector group_ranks(WorldSize); - std::vector mygroup(GroupSize); - for(int r=0;r &processors) _processors = processors; _processor_coor.resize(_ndimension); for(int i=0;i<_ndimension;i++){ - std::cout << " p " << _processors[i]<()); - int myleader = mygroup[0]; - - std::vector leaders_1hot(WorldSize,0); - std::vector leaders_group(GroupSize,0); - leaders_1hot [ myleader ] = 1; - - /////////////////////////////////////////////////////////////////// - // global sum leaders over comm world - /////////////////////////////////////////////////////////////////// - int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator); - assert(ierr==0); - - /////////////////////////////////////////////////////////////////// - // find the group leaders world rank - /////////////////////////////////////////////////////////////////// - int group=0; - for(int l=0;l &lis { MPI_Request xrq; MPI_Request rrq; + int rank = _processor; int ierr; - ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); + int tag; + int small = (bytes &list) { int nreq=list.size(); From 306160ad9ad7675774931164e48154806c0ce6aa Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 21 Oct 2016 12:07:28 +0100 Subject: [PATCH 02/27] bcopy 
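// A minimal sketch (illustrative) of the leader-election step in the patch 01
// constructor above: every rank marks its node leader (the lowest world rank
// in its shared-memory group) in a one-hot vector; an in-place Allreduce then
// lets each rank enumerate the leaders in world-rank order, yielding both
// GroupSize and this node's GroupRank.
#include <mpi.h>
#include <vector>
#include <cassert>
static int group_rank_sketch(MPI_Comm world,int my_leader)
{
  int world_size; MPI_Comm_size(world,&world_size);
  std::vector<int> leaders_1hot(world_size,0);
  leaders_1hot[my_leader] = 1;
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],world_size,
                           MPI_INT,MPI_SUM,world);
  assert(ierr==0);
  int group=0, my_group=-1;
  for(int l=0;l<world_size;l++){      // leaders appear in world-rank order
    if( l==my_leader ) my_group = group;
    if( leaders_1hot[l] ) group++;    // after the loop, group == GroupSize
  }
  assert(my_group>=0);
  return my_group;                    // this node's GroupRank
}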
threaded --- lib/communicator/Communicator_mpi3.cc | 54 ++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 1a36e610..240aa09e 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -196,10 +196,12 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) } ShmCommBufs.resize(ShmSize); + ShmStencilBufs.resize(ShmSize); for(int r=0;r &lis int from, int bytes) { +#undef SHM_USE_BCOPY MPI_Request xrq; MPI_Request rrq; + static int sequence; + int rank = _processor; int ierr; int tag; - int small = (bytes>2; + assert(((size_t)bytes &0x3)==0); + assert(((size_t)xmit &0x3)==0); + assert(((size_t)recv &0x3)==0); +#endif + assert(gme == ShmRank); if ( small && (dest !=MPI_UNDEFINED) ) { - char *ptr = (char *)ShmCommBufs[gdest]; assert(gme != gdest); - GridThread::bcopy(xmit,ptr,bytes); - bcopy(&_processor,&ptr[bytes],sizeof(_processor)); - bcopy(& sequence,&ptr[bytes+4],sizeof(sequence)); + float *ip = (float *)xmit; + float *op = (float *)to_ptr; + +#ifdef SHM_USE_BCOPY + bcopy(ip,op,bytes); +#else + PARALLEL_FOR_LOOP + for(int w=0;w &lis MPI_Win_sync (ShmWindow); if (small && (from !=MPI_UNDEFINED) ) { - char *ptr = (char *)ShmCommBufs[ShmRank]; - GridThread::bcopy(ptr,recv,bytes); - bcopy(&ptr[bytes] ,&tag ,sizeof(tag)); - bcopy(&ptr[bytes+4],&check,sizeof(check)); + float *ip = (float *)from_ptr; + float *op = (float *)recv; +#ifdef SHM_USE_BCOPY + bcopy(ip,op,bytes); +#else + PARALLEL_FOR_LOOP + for(int w=0;w Date: Fri, 21 Oct 2016 12:12:14 +0100 Subject: [PATCH 03/27] Compile verbose reduce --- lib/Threads.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Threads.h b/lib/Threads.h index 70350685..08e5d545 100644 --- a/lib/Threads.h +++ b/lib/Threads.h @@ -31,7 +31,6 @@ Author: paboyle #ifdef _OPENMP #define GRID_OMP -#warning "OpenMP" #endif #define UNROLL _Pragma("unroll") From f331809c2715cd4dc1b0e486c77d8552010a79de Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 13:35:37 +0100 Subject: [PATCH 04/27] Use variable type for loop --- lib/communicator/Communicator_mpi3.cc | 45 ++++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 240aa09e..7d5ffff3 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -1,3 +1,4 @@ + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -181,6 +182,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ShmCommBuf = 0; ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow); assert(ierr==0); + MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Verbose for now @@ -368,22 +370,24 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int small = (bytes>2; - assert(((size_t)bytes &0x3)==0); - assert(((size_t)xmit &0x3)==0); - assert(((size_t)recv &0x3)==0); + typedef double T; + int words = bytes/sizeof(T); + assert(((size_t)bytes &(sizeof(T)-1))==0); + // assert(((size_t)xmit &(sizeof(T)-1))==0); + // assert(((size_t)recv &(sizeof(T)-1))==0); #endif assert(gme == ShmRank); - + + // std::cerr << 
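// A minimal sketch (illustrative; slot size and field names are assumptions)
// of the tagged shared-memory bounce used above: the sender copies the payload
// into the destination rank's slot in the shared window, then appends its rank
// and a sequence number; after MPI_Win_sync and a node barrier the receiver
// drains the payload and checks both trailer words, catching reuse of a slot
// before the previous message was consumed.
#include <cstring>
#include <cassert>
struct ShmSlotSketch { char payload[1024]; int tag; int seq; };
static void shm_send_sketch(ShmSlotSketch *dest_slot,const void *xmit,
                            size_t bytes,int my_rank,int sequence)
{
  assert(bytes<=sizeof(dest_slot->payload));
  memcpy(dest_slot->payload,xmit,bytes); // payload first
  dest_slot->tag = my_rank;              // then the rank tag ...
  dest_slot->seq = sequence;             // ... and the sequence number
}
static void shm_recv_sketch(const ShmSlotSketch *my_slot,void *recv,
                            size_t bytes,int expect_rank,int expect_seq)
{
  // caller has already issued MPI_Win_sync on the window and a node barrier
  memcpy(recv,my_slot->payload,bytes);
  assert(my_slot->tag==expect_rank);
  assert(my_slot->seq==expect_seq);
}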
"proc dest from gme gdest "<<_processor<<" "< &lis list.push_back(xrq); } + // std::cout << "Syncing "< &lis list.push_back(rrq); } + // std::cout << "Syncing"< &list) From fad96cf2507a5a36bfe3fdd7f9dd9bb19be15543 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 13:36:00 +0100 Subject: [PATCH 05/27] StencilBufs --- lib/communicator/Communicator_base.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 6d6602be..f2a6bf00 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -80,6 +80,7 @@ class CartesianCommunicator { void * ShmCommBuf; std::vector ShmCommBufs; + std::vector ShmStencilBufs; int WorldRank; int WorldSize; From 6a9eae6b6b5253fa6372f0a742110517e92b209a Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 13:36:18 +0100 Subject: [PATCH 06/27] Reporting improvements --- lib/qcd/action/fermion/WilsonFermion5D.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index a96b6fca..7c4fafff 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -194,7 +194,7 @@ void WilsonFermion5D::Report(void) std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; - RealD mflops = 1344*volume*DhopCalls/DhopComputeTime; + RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; @@ -396,7 +396,6 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { - DhopCalls++; // assert((dag==DaggerNo) ||(dag==DaggerYes)); Compressor compressor(dag); @@ -454,6 +453,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, template void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) { + DhopCalls++; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -465,6 +465,7 @@ void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int template void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls++; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -476,6 +477,7 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=2; conformable(in._grid,FermionGrid()); // verifies full grid conformable(in._grid,out._grid); From 09fd5c43a711900e101dc1e3d2a449b8a4f1a93c Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 15:17:39 +0100 Subject: [PATCH 07/27] Reasonably fast version --- lib/communicator/Communicator_mpi3.cc | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 7d5ffff3..5c7bb09b 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -28,6 +28,7 @@ Author: Peter Boyle /* END LEGAL */ #include "Grid.h" 
#include +#include namespace Grid { @@ -182,6 +183,15 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ShmCommBuf = 0; ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow); assert(ierr==0); + for(uint64_t page=0;page &lis char *to_ptr = (char *)ShmCommBufs[gdest]; char *from_ptr = (char *)ShmCommBufs[ShmRank]; - int small = (bytes Date: Fri, 21 Oct 2016 22:34:29 +0100 Subject: [PATCH 08/27] Typo fixes and rotate for CLANG --- lib/simd/Grid_avx512.h | 12 +++++++++++- lib/simd/Intel512avx.h | 2 +- lib/simd/Intel512common.h | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 821898d9..45d6d988 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -41,6 +41,16 @@ Author: paboyle namespace Grid{ namespace Optimization { + + union u512f { + __m512 v; + float f[16]; + }; + + union u512d { + __m512d v; + double f[8]; + }; struct Vsplat{ //Complex float @@ -361,7 +371,7 @@ namespace Optimization { // Some Template specialization // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases -#undef GNU_CLANG_COMPILER +#define GNU_CLANG_COMPILER #ifdef GNU_CLANG_COMPILER //Complex float Reduce template<> diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index cdd54a33..19157db4 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -53,7 +53,7 @@ Author: paboyle #define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ VSHUFMEMd(O,P,tmp) \ - VMULMEMd(O,P,B,Biirr) \ + VMULMEMd(O,P,B,Biirr) \ VMULMEMd(O,P,C,Ciirr) \ VMULd(tmp,B,Briir) \ VMULd(tmp,C,Criir) diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index dbfb30c2..cfa20c26 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -37,7 +37,7 @@ Author: paboyle // Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// #define MASK_REGS \ - __asm__ ("mov $0xAAAA, %%eax \n"\ + __asm__ ("mov $0xAAAA, %%eax \n"\ "kmovw %%eax, %%k6 \n"\ "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); From 910b8dd6a1d89443ed42306439161db09d05d81b Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 22:35:29 +0100 Subject: [PATCH 09/27] use simd type --- lib/communicator/Communicator_mpi3.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 5c7bb09b..be4e66a9 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -380,7 +380,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int small = (bytes &lis T *op = (T *)to_ptr; PARALLEL_FOR_LOOP for(int w=0;w &lis T *op = (T *)recv; PARALLEL_FOR_LOOP for(int w=0;w Date: Fri, 21 Oct 2016 22:44:10 +0100 Subject: [PATCH 10/27] Simplify the comms structure prior to implementing Shared memory direct bouncs --- lib/Stencil.h | 168 +++++++++++++++----------------------------------- 1 file changed, 49 insertions(+), 119 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 65de9945..4850425b 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -70,20 +70,20 @@ namespace Grid { -template void -Gather_plane_simple_table_compute (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector >& table) +inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int 
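// A minimal sketch (assuming 4KB pages) of the first-touch loop PATCH 07 adds
// right after MPI_Win_allocate_shared above: storing one byte per page forces
// the kernel to map the whole shared region up front rather than faulting
// pages in on the communication critical path.
#include <cstdint>
static void touch_pages_sketch(void *shm_buf,uint64_t bytes)
{
  const uint64_t page_size = 4096;
  volatile char *p = (volatile char *)shm_buf;
  for(uint64_t page=0;page<bytes;page+=page_size){
    p[page] = 0;                      // fault the page in now
  }
}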
plane,int cbmask, + int off,std::vector > & table) { table.resize(0); - int rd = rhs._grid->_rdimensions[dimension]; + int rd = grid->_rdimensions[dimension]; - if ( !rhs._grid->CheckerBoarded(dimension) ) { + if ( !grid->CheckerBoarded(dimension) ) { cbmask = 0x3; } - int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int e1=rhs._grid->_slice_nblock[dimension]; - int e2=rhs._grid->_slice_block[dimension]; + int so= plane*grid->_ostride[dimension]; // base offset for start of plane + int e1=grid->_slice_nblock[dimension]; + int e2=grid->_slice_block[dimension]; - int stride=rhs._grid->_slice_stride[dimension]; + int stride=grid->_slice_stride[dimension]; if ( cbmask == 0x3 ) { table.resize(e1*e2); for(int n=0;n &rhs,commVector &bu for(int n=0;nCheckerBoardFromOindexTable(o+b); + int ocb=1<CheckerBoardFromOindexTable(o+b); if ( ocb &cbmask ) { table[bo]=std::pair(bo,o+b); bo++; } @@ -109,8 +109,7 @@ Gather_plane_simple_table_compute (const Lattice &rhs,commVector &bu } template void -Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,commVector &buffer, - compressor &compress, int off,int so) +Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { PARALLEL_FOR_LOOP for(int i=0;i void -Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off, +Gather_plane_simple_stencil (const Lattice &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off, double &t_table ,double & t_data ) { std::vector > table; - Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table); + Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table); int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane Gather_plane_simple_table (table,rhs,buffer,compress,off,so); } @@ -143,10 +142,11 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. 
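// A minimal sketch (illustrative; the compressor is dropped for clarity) of
// how the (buffer,lattice) offset table computed above is consumed: the index
// arithmetic and checkerboard filtering run once, and every subsequent halo
// exchange reduces to a flat, trivially parallel copy through the table, as in
// Gather_plane_simple_table.
#include <vector>
#include <utility>
template<class obj>
static void gather_via_table_sketch(const std::vector<std::pair<int,int> > &table,
                                    const obj *lattice,obj *buffer,
                                    int off,int so)
{
  for(size_t i=0;i<table.size();i++){  // PARALLEL_FOR_LOOP in Grid proper
    buffer[off+table[i].first] = lattice[so+table[i].second];
  }
}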
public: - typedef uint32_t StencilInteger; - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - typedef typename cobj::scalar_object scalar_object; + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + typedef uint32_t StencilInteger; + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + typedef typename cobj::scalar_object scalar_object; ////////////////////////////////////////// // Comms packet queue for asynch thread @@ -158,7 +158,6 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i Integer to_rank; Integer from_rank; Integer bytes; - volatile Integer done; }; std::vector Packets; @@ -166,81 +165,39 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i int face_table_computed; std::vector > > face_table ; -#define SEND_IMMEDIATE -#define SERIAL_SENDS - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - #ifdef SEND_IMMEDIATE - commtime-=usecond(); - _grid->SendToRecvFrom(xmit,to,rcv,from,bytes); - commtime+=usecond(); - #endif Packet p; p.send_buf = xmit; p.recv_buf = rcv; p.to_rank = to; p.from_rank= from; p.bytes = bytes; - p.done = 0; comms_bytes+=2.0*bytes; Packets.push_back(p); - } - #ifdef SERIAL_SENDS - void Communicate(void ) { - commtime-=usecond(); + void CommunicateBegin(std::vector > &reqs) + { + reqs.resize(Packets.size()); + commtime-=usecond(); for(int i=0;iSendToRecvFrom( - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - #endif - Packets[i].done = 1; + _grid->SendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); } commtime+=usecond(); } - #else - void Communicate(void ) { - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - std::vector > reqs(Packets.size()); - commtime-=usecond(); - const int concurrency=2; - for(int i=0;iSendToRecvFromBegin(reqs[j], - Packets[j].send_buf, - Packets[j].to_rank, - Packets[j].recv_buf, - Packets[j].from_rank, - Packets[j].bytes); - #endif - } - } - for(int ii=0;iiSendToRecvFromComplete(reqs[i]); - #endif - } - } - for(int ii=0;ii > &reqs) + { + commtime-=usecond(); + for(int i=0;iSendToRecvFromComplete(reqs[i]); } commtime+=usecond(); } - #endif /////////////////////////////////////////// // Simd merge queue for asynch comms @@ -260,36 +217,19 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i m.rpointers= rpointers; m.buffer_size = buffer_size; m.packet_id = packet_id; - #ifdef SEND_IMMEDIATE - mergetime-=usecond(); - PARALLEL_FOR_LOOP - for(int o=0;o &rhs,commVector &buffer,i else return cbase + _entries[ent]._byte_offset; } + /////////////////////////////////////////////////////////// // Comms buffers + /////////////////////////////////////////////////////////// std::vector > u_simd_send_buf; std::vector > u_simd_recv_buf; commVector u_send_buf; commVector comm_buf; + int u_comm_offset; int _unified_buffer_size; @@ -483,7 +426,7 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i } } u_send_buf.resize(_unified_buffer_size); - comm_buf.resize(_unified_buffer_size); + comm_buf.resize(_unified_buffer_size); PrecomputeByteOffsets(); @@ -722,31 +665,16 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i template void HaloExchange(const Lattice &source,compressor &compress) { + std::vector > reqs; calls++; Mergers.resize(0); Packets.resize(0); 
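// A minimal sketch (illustrative; names are assumptions) of the begin/complete
// split that replaces the old blocking Communicate(): every face's send and
// receive is posted up front, then all requests are completed in a single
// Waitall, so independent faces are in flight concurrently.
#include <mpi.h>
#include <vector>
struct PacketSketch { void *send_buf; void *recv_buf; int to; int from; int bytes; };
static void communicate_sketch(MPI_Comm comm,std::vector<PacketSketch> &pkts)
{
  std::vector<MPI_Request> reqs(2*pkts.size());
  for(size_t i=0;i<pkts.size();i++){   // CommunicateBegin: post everything
    MPI_Isend(pkts[i].send_buf,pkts[i].bytes,MPI_CHAR,pkts[i].to  ,0,comm,&reqs[2*i]);
    MPI_Irecv(pkts[i].recv_buf,pkts[i].bytes,MPI_CHAR,pkts[i].from,0,comm,&reqs[2*i+1]);
  }
  // CommunicateComplete: one wait over all outstanding requests
  MPI_Waitall((int)reqs.size(),reqs.data(),MPI_STATUSES_IGNORE);
}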
HaloGather(source,compress); - this->Communicate(); + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); CommsMerge(); // spins } -#if 0 - // Overlapping comms and compute typically slows down compute and is useless - // unless memory bandwidth greatly exceeds network - template - std::thread HaloExchangeBegin(const Lattice &source,compressor &compress) { - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - return std::thread([&] { this->Communicate(); }); - } - void HaloExchangeComplete(std::thread &thr) - { - CommsMerge(); // spins - jointime-=usecond(); - thr.join(); - jointime+=usecond(); - } -#endif + template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) { @@ -851,6 +779,9 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i int sx = (x+sshift)%rd; int comm_proc = ((x+sshift)/rd)%pd; + cobj *u_send_buf_p; + cobj *comm_buf_p; + if (comm_proc) { int words = buffer_size; @@ -863,16 +794,15 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i if ( !face_table_computed ) { t_table-=usecond(); face_table.resize(face_idx+1); - Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]); + cobj *ptr; ptr = &u_send_buf[0]; + Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, + face_table[face_idx]); t_table+=usecond(); } t_data-=usecond(); - Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so); - face_idx++; + Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++; t_data+=usecond(); gathertime+=usecond(); - - // Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data); int rank = _grid->_processor; int recv_from_rank; From c190221fd35eba678ec9810aa3b44816085d8283 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Sat, 22 Oct 2016 18:14:27 +0100 Subject: [PATCH 11/27] Internal SHM comms in non-simd directions working Need to fix simd directions --- benchmarks/Benchmark_dwf.cc | 8 +- lib/Stencil.h | 1609 +++++++++--------- lib/communicator/Communicator_base.h | 15 +- lib/communicator/Communicator_mpi.cc | 8 + lib/communicator/Communicator_mpi3.cc | 105 +- lib/communicator/Communicator_none.cc | 8 + lib/communicator/Communicator_shmem.cc | 8 + lib/qcd/action/fermion/FermionOperatorImpl.h | 857 +++++----- lib/qcd/action/fermion/WilsonFermion.cc | 8 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 101 +- lib/qcd/action/fermion/WilsonFermion5D.h | 289 ++-- lib/qcd/action/fermion/WilsonKernels.cc | 21 +- lib/qcd/action/fermion/WilsonKernels.h | 275 ++- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 95 +- lib/qcd/action/fermion/WilsonKernelsHand.cc | 55 +- tests/Test_stencil.cc | 6 +- 16 files changed, 1729 insertions(+), 1739 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index a32d7535..f75f0385 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -153,7 +153,7 @@ int main (int argc, char ** argv) std::cout< // subdir aggregate +const int ShmDirectCopy = 1; + ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. 
CSHIFT is not the best way, so need @@ -68,7 +70,7 @@ // ////////////////////////////////////////////////////////////////////////////////////////// - namespace Grid { +namespace Grid { inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table) @@ -117,822 +119,837 @@ PARALLEL_FOR_LOOP } } -template void -Gather_plane_simple_stencil (const Lattice &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off, - double &t_table ,double & t_data ) -{ - std::vector > table; - Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table); - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - Gather_plane_simple_table (table,rhs,buffer,compress,off,so); -} +struct StencilEntry { + uint64_t _offset; + uint64_t _byte_offset; + uint16_t _is_local; + uint16_t _permute; + uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline +}; +template +class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. + public: + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + typedef uint32_t StencilInteger; + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + typedef typename cobj::scalar_object scalar_object; + + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// + struct Packet { + void * send_buf; + void * recv_buf; + Integer to_rank; + Integer from_rank; + Integer bytes; + }; - struct StencilEntry { - uint64_t _offset; - uint64_t _byte_offset; - uint16_t _is_local; - uint16_t _permute; - uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline - }; + std::vector Packets; - template - class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. 
- public: + int face_table_computed; + std::vector > > face_table ; + + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + comms_bytes+=2.0*bytes; + Packets.push_back(p); + } - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - typedef uint32_t StencilInteger; - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - typedef typename cobj::scalar_object scalar_object; + void CommunicateBegin(std::vector > &reqs) + { + reqs.resize(Packets.size()); + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + }else{ + _grid->SendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + } + } + commtime+=usecond(); + } + void CommunicateComplete(std::vector > &reqs) + { + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + else + _grid->SendToRecvFromComplete(reqs[i]); + } + commtime+=usecond(); + } - ////////////////////////////////////////// - // Comms packet queue for asynch thread - ////////////////////////////////////////// + /////////////////////////////////////////// + // Simd merge queue for asynch comms + /////////////////////////////////////////// + struct Merge { + cobj * mpointer; + std::vector rpointers; + Integer buffer_size; + Integer packet_id; + }; + + std::vector Mergers; - struct Packet { - void * send_buf; - void * recv_buf; - Integer to_rank; - Integer from_rank; - Integer bytes; - }; + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { + Merge m; + m.mpointer = merge_p; + m.rpointers= rpointers; + m.buffer_size = buffer_size; + m.packet_id = packet_id; + Mergers.push_back(m); + } - std::vector Packets; + void CommsMerge(void ) { - int face_table_computed; - std::vector > > face_table ; - - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - Packet p; - p.send_buf = xmit; - p.recv_buf = rcv; - p.to_rank = to; - p.from_rank= from; - p.bytes = bytes; - comms_bytes+=2.0*bytes; - Packets.push_back(p); - } - - void CommunicateBegin(std::vector > &reqs) - { - reqs.resize(Packets.size()); - commtime-=usecond(); - for(int i=0;iSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - } - commtime+=usecond(); - } - void CommunicateComplete(std::vector > &reqs) - { - commtime-=usecond(); - for(int i=0;iSendToRecvFromComplete(reqs[i]); - } - commtime+=usecond(); - } - - /////////////////////////////////////////// - // Simd merge queue for asynch comms - /////////////////////////////////////////// - struct Merge { - cobj * mpointer; - std::vector rpointers; - Integer buffer_size; - Integer packet_id; - }; - - std::vector Mergers; - - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { - Merge m; - m.mpointer = merge_p; - m.rpointers= rpointers; - m.buffer_size = buffer_size; - m.packet_id = packet_id; - Mergers.push_back(m); - } - - void CommsMerge(void ) { - - for(int i=0;i _directions; + std::vector _distances; + std::vector _comm_buf_size; + std::vector _permute_type; + + // npoints x Osites() of these + // Flat vector, change layout for cache friendly. 
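// A minimal sketch (illustrative) of the address resolution that
// PrecomputeByteOffsets/GetInfo make cheap in the hot loop: each stencil entry
// carries a byte offset already scaled by the right object size (vobj for
// local neighbours, cobj for comms-buffer entries), so resolving a neighbour
// is one select between the lattice base and the receive-buffer base, plus an
// add.
#include <cstdint>
struct EntrySketch { uint64_t byte_offset; uint16_t is_local; };
static inline uint64_t neighbour_addr_sketch(const EntrySketch &e,
                                             uint64_t lattice_base,
                                             uint64_t recv_buf_base)
{
  return (e.is_local ? lattice_base : recv_buf_base) + e.byte_offset;
}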
+ Vector _entries; + + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } + + void PrecomputeByteOffsets(void){ + for(int i=0;i<_entries.size();i++){ + if( _entries[i]._is_local ) { + _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); + } else { + _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); + } + } + }; - int _checkerboard; - int _npoints; // Move to template param? - GridBase * _grid; + inline uint64_t Touch(int ent) { + // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); + } + inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + local = _entries[ent]._is_local; + perm = _entries[ent]._permute; + if (perm) ptype = _permute_type[point]; + if (local) { + return base + _entries[ent]._byte_offset; + } else { + return cbase + _entries[ent]._byte_offset; + } + } + inline uint64_t GetPFInfo(int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + int local = _entries[ent]._is_local; + if (local) return base + _entries[ent]._byte_offset; + else return cbase + _entries[ent]._byte_offset; + } + + /////////////////////////////////////////////////////////// + // Unified Comms buffers for all directions + /////////////////////////////////////////////////////////// + std::vector > u_simd_send_buf; + std::vector > u_simd_recv_buf; + commVector u_send_buf; + commVector u_recv_buf_hide; + cobj* u_recv_buf_p; - // npoints of these - std::vector _directions; - std::vector _distances; - std::vector _comm_buf_size; - std::vector _permute_type; + int u_comm_offset; + int _unified_buffer_size; + + cobj *CommBuf(void) { return u_recv_buf_p; } - // npoints x Osites() of these - // Flat vector, change layout for cache friendly. 
- Vector _entries; - - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - - void PrecomputeByteOffsets(void){ - for(int i=0;i<_entries.size();i++){ - if( _entries[i]._is_local ) { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); - } else { - // PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952 - _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); - } - } - }; - - inline uint64_t Touch(int ent) { - // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); - } - inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&comm_buf[0]; - local = _entries[ent]._is_local; - perm = _entries[ent]._permute; - if (perm) ptype = _permute_type[point]; - if (local) { - return base + _entries[ent]._byte_offset; - } else { - return cbase + _entries[ent]._byte_offset; - } - } - inline uint64_t GetPFInfo(int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&comm_buf[0]; - int local = _entries[ent]._is_local; - if (local) return base + _entries[ent]._byte_offset; - else return cbase + _entries[ent]._byte_offset; - } - - /////////////////////////////////////////////////////////// - // Comms buffers - /////////////////////////////////////////////////////////// - std::vector > u_simd_send_buf; - std::vector > u_simd_recv_buf; - commVector u_send_buf; - commVector comm_buf; - - int u_comm_offset; - int _unified_buffer_size; - - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// + ///////////////////////////////////////// + // Timing info; ugly; possibly temporary + ///////////////////////////////////////// #define TIMING_HACK #ifdef TIMING_HACK - double jointime; - double gathertime; - double commtime; - double halogtime; - double mergetime; - double spintime; - double comms_bytes; - double gathermtime; - double splicetime; - double nosplicetime; - double t_data; - double t_table; - double calls; - - void ZeroCounters(void) { - gathertime = 0.; - jointime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - spintime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - t_data = 0.0; - t_table= 0.0; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { + double jointime; + double gathertime; + double commtime; + double halogtime; + double mergetime; + double spintime; + double comms_bytes; + double gathermtime; + double splicetime; + double nosplicetime; + double t_data; + double t_table; + double calls; + + void ZeroCounters(void) { + gathertime = 0.; + jointime = 0.; + commtime = 0.; + halogtime = 0.; + mergetime = 0.; + spintime = 0.; + gathermtime = 0.; + splicetime = 0.; + nosplicetime = 0.; + t_data = 0.0; + t_table= 0.0; + comms_bytes = 0.; + calls = 0.; + }; + + void Report(void) { #define PRINTIT(A) \ std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls< 0. ) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "< 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. 
<< " GB/s "< &directions, - const std::vector &distances) - : _permute_type(npoints), _comm_buf_size(npoints) - { - face_table_computed=0; - _npoints = npoints; - _grid = grid; - _directions = directions; - _distances = distances; - _unified_buffer_size=0; - - int osites = _grid->oSites(); - - _entries.resize(_npoints* osites); - for(int ii=0;ii_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - _permute_type[point]=_grid->PermuteType(dimension); - - _checkerboard = checkerboard; - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - int rotate_dim = _grid->_simd_layout[dimension]>2; - - assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported - - int sshift[2]; - - // Underlying approach. For each local site build - // up a table containing the npoint "neighbours" and whether they - // live in lattice or a comms buffer. - if ( !comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - - if ( sshift[0] == sshift[1] ) { - Local(point,dimension,shift,0x3); - } else { - Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Local(point,dimension,shift,0x2);// both with block stride loop iteration - } - } else { // All permute extract done in comms phase prior to Stencil application - // So tables are the same whether comm_dim or splice_dim - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - - if ( sshift[0] == sshift[1] ) { - Comms(point,dimension,shift,0x3); - } else { - Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Comms(point,dimension,shift,0x2);// both with block stride loop iteration - } - } - } - u_send_buf.resize(_unified_buffer_size); - comm_buf.resize(_unified_buffer_size); - - PrecomputeByteOffsets(); - - const int Nsimd = grid->Nsimd(); - u_simd_send_buf.resize(Nsimd); - u_simd_recv_buf.resize(Nsimd); - for(int l=0;l_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int gd = _grid->_gdimensions[dimension]; - int ly = _grid->_simd_layout[dimension]; - - // Map to always positive shift modulo global full dimension. - int shift = (shiftpm+fd)%fd; - - // the permute type - int permute_dim =_grid->PermuteDim(dimension); - - for(int x=0;x_ostride[dimension]; - - int cb= (cbmask==0x2)? Odd : Even; - - int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - int sx = (x+sshift)%rd; - - int wraparound=0; - if ( (shiftpm==-1) && (sx>x) ) { - wraparound = 1; - } - if ( (shiftpm== 1) && (sxNsimd(); - - int fd = _grid->_fdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - int shift = (shiftpm + fd) %fd; - assert(shift>=0); - assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored - - _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and - // send to one or more remote nodes. - - int cb= (cbmask==0x2)? 
Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - - for(int x=0;xPermuteType(dimension); - - int sx = (x+sshift)%rd; - - int offnode = 0; - if ( simd_layout > 1 ) { - - for(int i=0;i>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - if ( nbr_proc ) { - offnode =1; - } - } - - } else { - int comm_proc = ((x+sshift)/rd)%pd; - offnode = (comm_proc!= 0); - } - - - int wraparound=0; - if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { - wraparound = 1; - } - if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { - wraparound = 1; - } - if (!offnode) { - - int permute_slice=0; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - - } else { - - int words = buffer_size; - if (cbmask != 0x3) words=words>>1; - - int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - - int unified_buffer_offset = _unified_buffer_size; - _unified_buffer_size += words; - - ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - - } - } - } - // Routine builds up integer table for each site in _offsets, _is_local, _permute - void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) - { - int rd = _grid->_rdimensions[dimension]; - - if ( !_grid->CheckerBoarded(dimension) ) { - - int o = 0; // relative offset to base within plane - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // offset in buffer - - // Simple block stride gather of SIMD objects - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - int idx=point+(lo+o+b)*_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._permute=permute; - _entries[idx]._is_local=1; - _entries[idx]._around_the_world=wrap; - } - o +=_grid->_slice_stride[dimension]; - } - - } else { - - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - - int ocb=1<<_grid->CheckerBoardFromOindex(o+b); - - if ( ocb&cbmask ) { - int idx = point+(lo+o+b)*_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._is_local=1; - _entries[idx]._permute=permute; - _entries[idx]._around_the_world=wrap; - } - - } - o +=_grid->_slice_stride[dimension]; - } - - } - } - // Routine builds up integer table for each site in _offsets, _is_local, _permute - void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) - { - int rd = _grid->_rdimensions[dimension]; - - if ( !_grid->CheckerBoarded(dimension) ) { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - int idx=point+(so+o+b)*_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute=0; - _entries[idx]._around_the_world=wrap; - } - o 
+=_grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - - int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb & cbmask ) { - int idx = point+(so+o+b)*_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute =0; - _entries[idx]._around_the_world=wrap; - } - } - o +=_grid->_slice_stride[dimension]; - } - } - } - - - - template - void HaloExchange(const Lattice &source,compressor &compress) - { - std::vector > reqs; - calls++; - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - CommsMerge(); // spins - } - - template - void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) - { - int dimension = _directions[point]; - int displacement = _distances[point]; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - - - // Map to always positive shift modulo global full dimension. - int shift = (displacement+fd)%fd; - - // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (source.checkerboard== _checkerboard); - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - - // Gather phase - int sshift [2]; - if ( comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { - if (splice_dim) { - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x3,compress,face_idx); - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x3,compress,face_idx); - nosplicetime+=usecond(); - } - } else { - if(splice_dim){ - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes - GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x1,compress,face_idx); - Gather(source,dimension,shift,0x2,compress,face_idx); - nosplicetime+=usecond(); - } - } - } - } - - template - void HaloGather(const Lattice &source,compressor &compress) - { - // conformable(source._grid,_grid); - assert(source._grid==_grid); - halogtime-=usecond(); - - assert (comm_buf.size() == _unified_buffer_size ); - u_comm_offset=0; - - // Gather all comms buffers - int face_idx=0; - for(int point = 0 ; point < _npoints; point++) { - compress.Point(point); - HaloGatherDir(source,compress,point,face_idx); - } - face_table_computed=1; - - assert(u_comm_offset==_unified_buffer_size); - halogtime+=usecond(); - } - - template - void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) - { - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - - GridBase *grid=_grid; - assert(rhs._grid==_grid); - // conformable(_grid,rhs._grid); - - int fd = _grid->_fdimensions[dimension]; - int rd = 
_grid->_rdimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; - - int cb= (cbmask==0x2)? Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - for(int x=0;x>1; - - int bytes = words * sizeof(cobj); - - gathertime-=usecond(); - int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane - if ( !face_table_computed ) { - t_table-=usecond(); - face_table.resize(face_idx+1); - cobj *ptr; ptr = &u_send_buf[0]; - Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, - face_table[face_idx]); - t_table+=usecond(); - } - t_data-=usecond(); - Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++; - t_data+=usecond(); - gathertime+=usecond(); - - int rank = _grid->_processor; - int recv_from_rank; - int xmit_to_rank; - _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - assert (xmit_to_rank != _grid->ThisRank()); - assert (recv_from_rank != _grid->ThisRank()); - - // FIXME Implement asynchronous send & also avoid buffer copy - AddPacket((void *)&u_send_buf[u_comm_offset], - (void *) &comm_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, - bytes); - - u_comm_offset+=words; - } - } - } - - - template - void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) - { - const int Nsimd = _grid->Nsimd(); - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - // This will not work with a rotate dim - assert(simd_layout==2); - assert(shift>=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; - int words = sizeof(cobj)/sizeof(vector_type); - - assert(cbmask==0x3); // Fixme think there is a latent bug if not true - - int bytes = buffer_size*sizeof(scalar_object); - - std::vector rpointers(Nsimd); - std::vector spointers(Nsimd); - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - - int cb = (cbmask==0x2)? Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x= rd ); - - if ( any_offnode ) { - - for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); - gathermtime+=usecond(); - - for(int i=0;i2 - // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); - int ic= (i&inner_bit)? 
1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - int nbr_lcoor= (nbr_coor%ld); - int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer - int nbr_ox = (nbr_lcoor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - if (nbr_ic) nbr_lane|=inner_bit; - assert (sx == nbr_ox); - - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; - - void *vrp = (void *)rp; - void *vsp = (void *)sp; - - - if(nbr_proc){ - - int recv_from_rank; - int xmit_to_rank; - - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes); - - rpointers[i] = rp; - - } else { - - rpointers[i] = sp; - - } - } - - AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1); - - u_comm_offset +=buffer_size; - } - } - } - - }; - } + CartesianStencil(GridBase *grid, + int npoints, + int checkerboard, + const std::vector &directions, + const std::vector &distances) + : _permute_type(npoints), _comm_buf_size(npoints) + { + face_table_computed=0; + _npoints = npoints; + _grid = grid; + _directions = directions; + _distances = distances; + _unified_buffer_size=0; + + int osites = _grid->oSites(); + + _entries.resize(_npoints* osites); + for(int ii=0;ii_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + _permute_type[point]=_grid->PermuteType(dimension); + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + int rotate_dim = _grid->_simd_layout[dimension]>2; + + assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported + + int sshift[2]; + + // Underlying approach. For each local site build + // up a table containing the npoint "neighbours" and whether they + // live in lattice or a comms buffer. 
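// A minimal sketch (illustrative) of the checkerboard dispatch that follows:
// for odd displacements the even and odd checkerboards shift by different
// amounts, so the neighbour table is built in two half-volume passes selected
// by a mask (0x1 even sites, 0x2 odd sites, 0x3 both) instead of one.
static void build_tables_sketch(int sshift_even,int sshift_odd,
                                void (*pass)(int cbmask,int sshift))
{
  if( sshift_even==sshift_odd ){
    pass(0x3,sshift_even);      // favourable: one pass covers both cbs
  } else {
    pass(0x1,sshift_even);      // unfavourable: two block-strided passes
    pass(0x2,sshift_odd);
  }
}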
+ if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + + if ( sshift[0] == sshift[1] ) { + Local(point,dimension,shift,0x3); + } else { + Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Local(point,dimension,shift,0x2);// both with block stride loop iteration + } + } else { // All permute extract done in comms phase prior to Stencil application + // So tables are the same whether comm_dim or splice_dim + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + + if ( sshift[0] == sshift[1] ) { + Comms(point,dimension,shift,0x3); + } else { + Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Comms(point,dimension,shift,0x2);// both with block stride loop iteration + } + } + } + u_send_buf.resize(_unified_buffer_size); + + ///////////////////////////////////////////////////////////////////////////////// + // Try to allocate for receiving in a shared memory region, fall back to buffer + ///////////////////////////////////////////////////////////////////////////////// + if( ShmDirectCopy ) { + + u_recv_buf_p=(cobj *)_grid->ShmBufferSelf(); + if ( u_recv_buf_p == NULL ) { + u_recv_buf_hide.resize(_unified_buffer_size); + u_recv_buf_p=&u_recv_buf_hide[0]; + } + } else { + u_recv_buf_hide.resize(_unified_buffer_size); + u_recv_buf_p=&u_recv_buf_hide[0]; + } + + PrecomputeByteOffsets(); + + const int Nsimd = grid->Nsimd(); + u_simd_send_buf.resize(Nsimd); + u_simd_recv_buf.resize(Nsimd); + for(int l=0;l_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + int ly = _grid->_simd_layout[dimension]; + + // Map to always positive shift modulo global full dimension. + int shift = (shiftpm+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? Odd : Even; + + int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int wraparound=0; + if ( (shiftpm==-1) && (sx>x) ) { + wraparound = 1; + } + if ( (shiftpm== 1) && (sxNsimd(); + + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + int shift = (shiftpm + fd) %fd; + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored + + _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and + // send to one or more remote nodes. + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); + + for(int x=0;xPermuteType(dimension); + + int sx = (x+sshift)%rd; + + int offnode = 0; + if ( simd_layout > 1 ) { + + for(int i=0;i>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + if ( nbr_proc ) { + offnode =1; + } + } + + } else { + int comm_proc = ((x+sshift)/rd)%pd; + offnode = (comm_proc!= 0); + } + + + int wraparound=0; + if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { + wraparound = 1; + } + if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { + wraparound = 1; + } + if (!offnode) { + + int permute_slice=0; + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + + } else { + + int words = buffer_size; + if (cbmask != 0x3) words=words>>1; + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + + int unified_buffer_offset = _unified_buffer_size; + _unified_buffer_size += words; + + ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase + + } + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int o = 0; // relative offset to base within plane + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + int idx=point+(lo+o+b)*_npoints; + _entries[idx]._offset =ro+o+b; + _entries[idx]._permute=permute; + _entries[idx]._is_local=1; + _entries[idx]._around_the_world=wrap; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b); + + if ( ocb&cbmask ) { + int idx = point+(lo+o+b)*_npoints; + _entries[idx]._offset =ro+o+b; + _entries[idx]._is_local=1; + _entries[idx]._permute=permute; + _entries[idx]._around_the_world=wrap; + } + + } + o +=_grid->_slice_stride[dimension]; + } + + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + int idx=point+(so+o+b)*_npoints; + _entries[idx]._offset =offset+(bo++); + _entries[idx]._is_local=0; + _entries[idx]._permute=0; + _entries[idx]._around_the_world=wrap; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int 
b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb & cbmask ) { + int idx = point+(so+o+b)*_npoints; + _entries[idx]._offset =offset+(bo++); + _entries[idx]._is_local=0; + _entries[idx]._permute =0; + _entries[idx]._around_the_world=wrap; + } + } + o +=_grid->_slice_stride[dimension]; + } + } + } + + template void HaloExchange(const Lattice &source,compressor &compress) + { + std::vector > reqs; + calls++; + Mergers.resize(0); + Packets.resize(0); + HaloGather(source,compress); + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); + CommsMerge(); // spins + } + + template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) + { + int dimension = _directions[point]; + int displacement = _distances[point]; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + + // Map to always positive shift modulo global full dimension. + int shift = (displacement+fd)%fd; + + // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); + assert (source.checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + splicetime-=usecond(); + GatherSimd(source,dimension,shift,0x3,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + Gather(source,dimension,shift,0x3,compress,face_idx); + nosplicetime+=usecond(); + } + } else { + if(splice_dim){ + splicetime-=usecond(); + GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes + GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + Gather(source,dimension,shift,0x1,compress,face_idx); + Gather(source,dimension,shift,0x2,compress,face_idx); + nosplicetime+=usecond(); + } + } + } + } + + template + void HaloGather(const Lattice &source,compressor &compress) + { + // conformable(source._grid,_grid); + assert(source._grid==_grid); + halogtime-=usecond(); + + u_comm_offset=0; + + // Gather all comms buffers + int face_idx=0; + for(int point = 0 ; point < _npoints; point++) { + compress.Point(point); + HaloGatherDir(source,compress,point,face_idx); + } + face_table_computed=1; + + assert(u_comm_offset==_unified_buffer_size); + halogtime+=usecond(); + } + + template + void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + { + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + + GridBase *grid=_grid; + assert(rhs._grid==_grid); + // conformable(_grid,rhs._grid); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + + int cb= (cbmask==0x2)? 
Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(cobj); + + gathertime-=usecond(); + int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane + if ( !face_table_computed ) { + t_table-=usecond(); + face_table.resize(face_idx+1); + Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, + face_table[face_idx]); + t_table+=usecond(); + } + + + int rank = _grid->_processor; + int recv_from_rank; + int xmit_to_rank; + _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + assert (xmit_to_rank != _grid->ThisRank()); + assert (recv_from_rank != _grid->ThisRank()); + + ///////////////////////////////////////////////////////// + // try the direct copy if possible + ///////////////////////////////////////////////////////// + + cobj *u_send_buf_p = &u_send_buf[0]; + if (ShmDirectCopy) { + cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank); + if ( shm!=NULL) { + u_send_buf_p = shm; + } + } + + t_data-=usecond(); + Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so); face_idx++; + t_data+=usecond(); + + AddPacket((void *)&u_send_buf_p[u_comm_offset], + (void *)&u_recv_buf_p[u_comm_offset], + xmit_to_rank, + recv_from_rank, + bytes); + + gathertime+=usecond(); + u_comm_offset+=words; + } + } + } + + template + void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + { + const int Nsimd = _grid->Nsimd(); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + // This will not work with a rotate dim + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int words = sizeof(cobj)/sizeof(vector_type); + + assert(cbmask==0x3); // Fixme think there is a latent bug if not true + + int bytes = buffer_size*sizeof(scalar_object); + + std::vector rpointers(Nsimd); + std::vector spointers(Nsimd); + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x= rd ); + + if ( any_offnode ) { + + for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); + gathermtime+=usecond(); + + for(int i=0;i2 + // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + int nbr_lcoor= (nbr_coor%ld); + int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer + int nbr_ox = (nbr_lcoor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + if (nbr_ic) nbr_lane|=inner_bit; + assert (sx == nbr_ox); + + auto rp = &u_simd_recv_buf[i ][u_comm_offset]; + auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; + + if(nbr_proc){ + + int recv_from_rank; + int xmit_to_rank; + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + rpointers[i] = rp; + + } else { + + rpointers[i] = sp; + + } + } + + assert(0); + AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); + + u_comm_offset +=buffer_size; + } + } + } + +}; +} #endif diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index f2a6bf00..4139f72a 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -80,7 +80,6 @@ class CartesianCommunicator { void * ShmCommBuf; std::vector ShmCommBufs; - std::vector ShmStencilBufs; int WorldRank; int WorldSize; @@ -105,6 +104,10 @@ class CartesianCommunicator { int RankFromProcessorCoor(std::vector &coor); void ProcessorCoorFromRank(int rank,std::vector &coor); + // Helper function for SHM Windows in MPI3 + void *ShmBufferSelf(void); + void *ShmBuffer(int rank); + ///////////////////////////////// // Grid information queries ///////////////////////////////// @@ -173,6 +176,16 @@ class CartesianCommunicator { int recv_from_rank, int bytes); void SendToRecvFromComplete(std::vector &waitall); + void StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + void StencilSendToRecvFromComplete(std::vector &waitall) + { + SendToRecvFromComplete(waitall); + } //////////////////////////////////////////////////////////// // Barrier diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 9c66202f..4291b319 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -67,6 +67,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) assert(Size==_Nprocessors); } +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index be4e66a9..f5bbdbda 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -197,10 +197,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Verbose for now ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - std::cout<< "Ranks per node "<< ShmSize << std::endl; - std::cout<< "Nodes "<< GroupSize << std::endl; - std::cout<< "Ranks "<< WorldSize << std::endl; - std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl; + std::cout< &processors) } ShmCommBufs.resize(ShmSize); - 
ShmStencilBufs.resize(ShmSize); for(int r=0;r &processors) ShmCoor.resize(_ndimension); GroupCoor.resize(_ndimension); WorldCoor.resize(_ndimension); + for(int l2=0;l2 &list, void *xmit, @@ -355,13 +369,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int from, int bytes) { -#undef SHM_USE_BCOPY MPI_Request xrq; MPI_Request rrq; static int sequence; - int rank = _processor; int ierr; int tag; int check; @@ -370,6 +382,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis assert(from != _processor); int gdest = GroupRanks[dest]; + int gfrom = GroupRanks[from]; int gme = GroupRanks[_processor]; sequence++; @@ -379,30 +392,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int small = (bytes &lis list.push_back(xrq); } - // std::cout << "Syncing "< &lis assert(ierr==0); list.push_back(rrq); } - - // std::cout << "Syncing"< &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes) +{ MPI_Request xrq; MPI_Request rrq; - int rank = _processor; - int ierr; - ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); - - assert(ierr==0); - list.push_back(xrq); - list.push_back(rrq); -#endif + int ierr; + + assert(dest != _processor); + assert(from != _processor); + + int gdest = GroupRanks[dest]; + int gfrom = GroupRanks[from]; + int gme = GroupRanks[_processor]; + + assert(gme == ShmRank); + + if ( gdest == MPI_UNDEFINED ) { + ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); + assert(ierr==0); + list.push_back(xrq); + } + + if ( gfrom ==MPI_UNDEFINED) { + ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); + assert(ierr==0); + list.push_back(rrq); + } + + MPI_Win_sync (ShmWindow); + MPI_Barrier (ShmComm); + MPI_Win_sync (ShmWindow); + } + void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) { int nreq=list.size(); diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 8601255a..80b8fb90 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -33,6 +33,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv) } int Rank(void ){ return 0; }; +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index 091e266e..4af719b0 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -50,6 +50,14 @@ typedef struct HandShake_t { static Vector< HandShake > XConnections; static Vector< HandShake > RConnections; +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); XConnections.resize(shmem_n_pes()); diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 4248ec4c..0800dea6 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -33,511 +33,500 @@ directory #define GRID_QCD_FERMION_OPERATOR_IMPL_H namespace Grid { - - namespace QCD { +namespace QCD { - ////////////////////////////////////////////// - // Template parameter class constructs to package 
- // externally control Fermion implementations - // in orthogonal directions - // - // Ultimately need Impl to always define types where XXX is opaque - // - // typedef typename XXX Simd; - // typedef typename XXX GaugeLinkField; - // typedef typename XXX GaugeField; - // typedef typename XXX GaugeActField; - // typedef typename XXX FermionField; - // typedef typename XXX DoubledGaugeField; - // typedef typename XXX SiteSpinor; - // typedef typename XXX SiteHalfSpinor; - // typedef typename XXX Compressor; - // - // and Methods: - // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) - // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) - // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) - // - // - // To acquire the typedefs from "Base" (either a base class or template param) use: - // - // INHERIT_GIMPL_TYPES(Base) - // INHERIT_FIMPL_TYPES(Base) - // INHERIT_IMPL_TYPES(Base) - // - // The Fermion operators will do the following: - // - // struct MyOpParams { - // RealD mass; - // }; - // - // - // template - // class MyOp : public { - // public: - // - // INHERIT_ALL_IMPL_TYPES(Impl); - // - // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) - // { - // - // }; - // - // } - ////////////////////////////////////////////// - - - //////////////////////////////////////////////////////////////////////// - // Implementation dependent fermion types - //////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////// + // Template parameter class constructs to package + // externally control Fermion implementations + // in orthogonal directions + // + // Ultimately need Impl to always define types where XXX is opaque + // + // typedef typename XXX Simd; + // typedef typename XXX GaugeLinkField; + // typedef typename XXX GaugeField; + // typedef typename XXX GaugeActField; + // typedef typename XXX FermionField; + // typedef typename XXX DoubledGaugeField; + // typedef typename XXX SiteSpinor; + // typedef typename XXX SiteHalfSpinor; + // typedef typename XXX Compressor; + // + // and Methods: + // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) + // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) + // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) + // + // + // To acquire the typedefs from "Base" (either a base class or template param) use: + // + // INHERIT_GIMPL_TYPES(Base) + // INHERIT_FIMPL_TYPES(Base) + // INHERIT_IMPL_TYPES(Base) + // + // The Fermion operators will do the following: + // + // struct MyOpParams { + // RealD mass; + // }; + // + // + // template + // class MyOp : public { + // public: + // + // INHERIT_ALL_IMPL_TYPES(Impl); + // + // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) + // { + // + // }; + // + // } + ////////////////////////////////////////////// + + 
//////////////////////////////////////////////////////////////////////// + // Implementation dependent fermion types + //////////////////////////////////////////////////////////////////////// + #define INHERIT_FIMPL_TYPES(Impl)\ - typedef typename Impl::FermionField FermionField; \ - typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ - typedef typename Impl::SiteSpinor SiteSpinor; \ - typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ - typedef typename Impl::Compressor Compressor; \ - typedef typename Impl::StencilImpl StencilImpl; \ - typedef typename Impl::ImplParams ImplParams; \ - typedef typename Impl::Coeff_t Coeff_t; - + typedef typename Impl::FermionField FermionField; \ + typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ + typedef typename Impl::SiteSpinor SiteSpinor; \ + typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ + typedef typename Impl::Compressor Compressor; \ + typedef typename Impl::StencilImpl StencilImpl; \ + typedef typename Impl::ImplParams ImplParams; \ + typedef typename Impl::Coeff_t Coeff_t; + #define INHERIT_IMPL_TYPES(Base) \ - INHERIT_GIMPL_TYPES(Base) \ - INHERIT_FIMPL_TYPES(Base) + INHERIT_GIMPL_TYPES(Base) \ + INHERIT_FIMPL_TYPES(Base) + + ///////////////////////////////////////////////////////////////////////////// + // Single flavour four spinors with colour index + ///////////////////////////////////////////////////////////////////////////// + template + class WilsonImpl : public PeriodicGaugeImpl > { + + public: + + static const int Dimension = Representation::Dimension; + typedef PeriodicGaugeImpl > Gimpl; + + //Necessary? + constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - /////// - // Single flavour four spinors with colour index - /////// - template - class WilsonImpl - : public PeriodicGaugeImpl > { - public: - static const int Dimension = Representation::Dimension; - typedef PeriodicGaugeImpl > Gimpl; - - //Necessary? - constexpr bool is_fundamental() const{return Dimension == Nc ? 
1 : 0;} + const bool LsVectorised=false; + typedef _Coeff_t Coeff_t; - const bool LsVectorised=false; - typedef _Coeff_t Coeff_t; - - - INHERIT_GIMPL_TYPES(Gimpl); + INHERIT_GIMPL_TYPES(Gimpl); - template using iImplSpinor = iScalar, Ns> >; - template using iImplHalfSpinor = iScalar, Nhs> >; - template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds>; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + + typedef Lattice FermionField; + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef iImplDoubledGaugeField SiteDoubledGaugeField; + bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - typedef Lattice FermionField; - typedef Lattice DoubledGaugeField; + inline void multLink(SiteHalfSpinor &phi, + const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, + int mu, + StencilEntry *SE, + StencilImpl &St) { + mult(&phi(), &U(mu), &chi()); + } - typedef WilsonCompressor Compressor; - typedef WilsonImplParams ImplParams; - typedef WilsonStencil StencilImpl; + template + inline void loadLinkElement(Simd ®, ref &memory) { + reg = memory; + } - ImplParams Params; - - WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; - - bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - - inline void multLink(SiteHalfSpinor &phi, - const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, - int mu, - StencilEntry *SE, - StencilImpl &St) { - mult(&phi(), &U(mu), &chi()); + inline void DoubleStore(GridBase *GaugeGrid, + DoubledGaugeField &Uds, + const GaugeField &Umu) { + conformable(Uds._grid, GaugeGrid); + conformable(Umu._grid, GaugeGrid); + GaugeLinkField U(GaugeGrid); + for (int mu = 0; mu < Nd; mu++) { + U = PeekIndex(Umu, mu); + PokeIndex(Uds, U, mu); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uds, U, mu + 4); } + } + + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ + GaugeLinkField link(mat._grid); + link = TraceIndex(outerProduct(Btilde,A)); + PokeIndex(mat,link,mu); + } - template - inline void loadLinkElement(Simd ®, - ref &memory) { - reg = memory; - } + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ - inline void DoubleStore(GridBase *GaugeGrid, - DoubledGaugeField &Uds, - const GaugeField &Umu) { - conformable(Uds._grid, GaugeGrid); - conformable(Umu._grid, GaugeGrid); - GaugeLinkField U(GaugeGrid); - for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); - PokeIndex(Uds, U, mu); - U = adj(Cshift(U, mu, -1)); - PokeIndex(Uds, U, mu + 4); + int Ls=Btilde._grid->_fdimensions[0]; + GaugeLinkField tmp(mat._grid); + tmp = zero; + + PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + int sU=sss; + for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here } } - - inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ - GaugeLinkField link(mat._grid); - link = TraceIndex(outerProduct(Btilde,A)); - PokeIndex(mat,link,mu); - } + PokeIndex(mat,tmp,mu); - inline void InsertForce5D(GaugeField &mat, FermionField 
&Btilde, FermionField Ã,int mu){ - - int Ls=Btilde._grid->_fdimensions[0]; - GaugeLinkField tmp(mat._grid); - tmp = zero; + } + }; - PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - int sU=sss; - for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here - } - } - PokeIndex(mat,tmp,mu); + //////////////////////////////////////////////////////////////////////////////////// + // Single flavour four spinors with colour index, 5d redblack + //////////////////////////////////////////////////////////////////////////////////// +template +class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { + public: + + static const int Dimension = Nrepresentation; + const bool LsVectorised=true; + typedef _Coeff_t Coeff_t; + typedef PeriodicGaugeImpl > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); + + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplGaugeField = iVector >, Nd>; + template using iImplGaugeLink = iScalar > >; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef Lattice FermionField; + + // Make the doubled gauge field a *scalar* + typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar + typedef iImplGaugeField SiteScalarGaugeField; // scalar + typedef iImplGaugeLink SiteScalarGaugeLink; // scalar + + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; + + bool overlapCommsCompute(void) { return false; }; + + template + inline void loadLinkElement(Simd ®, ref &memory) { + vsplat(reg, memory); + } + + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu, StencilEntry *SE, + StencilImpl &St) { + SiteGaugeLink UU; + for (int i = 0; i < Nrepresentation; i++) { + for (int j = 0; j < Nrepresentation; j++) { + vsplat(UU()()(i, j), U(mu)()(i, j)); } - }; - - /////// - // Single flavour four spinors with colour index, 5d redblack - /////// - template - class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { - public: + } + mult(&phi(), &UU(), &chi()); + } - static const int Dimension = Nrepresentation; - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; - typedef PeriodicGaugeImpl > Gimpl; - - INHERIT_GIMPL_TYPES(Gimpl); + inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) + { + SiteScalarGaugeField ScalarUmu; + SiteDoubledGaugeField ScalarUds; + + GaugeLinkField U(Umu._grid); + GaugeField Uadj(Umu._grid); + for (int mu = 0; mu < Nd; mu++) { + U = PeekIndex(Umu, mu); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uadj, U, mu); + } + + for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - template using iImplSpinor = iScalar, Ns> >; - template using iImplHalfSpinor = iScalar, Nhs> >; - template using iImplDoubledGaugeField = iVector >, Nds>; - template using iImplGaugeField = iVector >, Nd>; - template using iImplGaugeLink = iScalar > >; + peekLocalSite(ScalarUmu, Umu, lcoor); + for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef Lattice FermionField; + peekLocalSite(ScalarUmu, Uadj, 
lcoor); + for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - // Make the doubled gauge field a *scalar* - typedef iImplDoubledGaugeField - SiteDoubledGaugeField; // This is a scalar - typedef iImplGaugeField - SiteScalarGaugeField; // scalar - typedef iImplGaugeLink - SiteScalarGaugeLink; // scalar + pokeLocalSite(ScalarUds, Uds, lcoor); + } + } - typedef Lattice DoubledGaugeField; + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) + { + assert(0); + } - typedef WilsonCompressor Compressor; - typedef WilsonImplParams ImplParams; - typedef WilsonStencil StencilImpl; - - ImplParams Params; - - DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; - - bool overlapCommsCompute(void) { return false; }; - - template - inline void loadLinkElement(Simd ®, ref &memory) { - vsplat(reg, memory); - } - inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, int mu, StencilEntry *SE, - StencilImpl &St) { - SiteGaugeLink UU; - for (int i = 0; i < Nrepresentation; i++) { - for (int j = 0; j < Nrepresentation; j++) { - vsplat(UU()()(i, j), U(mu)()(i, j)); - } - } - mult(&phi(), &UU(), &chi()); - } - - inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds, - const GaugeField &Umu) { - SiteScalarGaugeField ScalarUmu; - SiteDoubledGaugeField ScalarUds; - - GaugeLinkField U(Umu._grid); - GaugeField Uadj(Umu._grid); - for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); - U = adj(Cshift(U, mu, -1)); - PokeIndex(Uadj, U, mu); - } - - for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - - peekLocalSite(ScalarUmu, Umu, lcoor); - for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - - peekLocalSite(ScalarUmu, Uadj, lcoor); - for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - - pokeLocalSite(ScalarUds, Uds, lcoor); - } - } - - inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, - FermionField &A, int mu) { + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField Ã, int mu) + { assert(0); - } - - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, - FermionField Ã, int mu) { - assert(0); - } - }; + } +}; //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? 
//////////////////////////////////////////////////////////////////////////////////////// - template - class GparityWilsonImpl - : public ConjugateGaugeImpl > { - public: - static const int Dimension = Nrepresentation; +template +class GparityWilsonImpl : public ConjugateGaugeImpl > { + public: - const bool LsVectorised=false; + static const int Dimension = Nrepresentation; - typedef _Coeff_t Coeff_t; - typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; + const bool LsVectorised=false; + + typedef _Coeff_t Coeff_t; + typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); - INHERIT_GIMPL_TYPES(Gimpl); + template using iImplSpinor = iVector, Ns>, Ngp>; + template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplDoubledGaugeField = iVector >, Nds>, Ngp>; - template - using iImplSpinor = - iVector, Ns>, Ngp>; - template - using iImplHalfSpinor = - iVector, Nhs>, Ngp>; - template - using iImplDoubledGaugeField = - iVector >, Nds>, Ngp>; + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + + typedef Lattice FermionField; + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonStencil StencilImpl; + + typedef GparityWilsonImplParams ImplParams; - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef iImplDoubledGaugeField SiteDoubledGaugeField; - - typedef Lattice FermionField; - typedef Lattice DoubledGaugeField; - - typedef WilsonCompressor Compressor; - typedef WilsonStencil StencilImpl; + ImplParams Params; - typedef GparityWilsonImplParams ImplParams; - - ImplParams Params; + GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; + bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; + // provide the multiply by link that is differentiated between Gparity (with + // flavour index) and non-Gparity + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu, StencilEntry *SE, + StencilImpl &St) { - bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - - // provide the multiply by link that is differentiated between Gparity (with - // flavour index) and non-Gparity - inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, int mu, StencilEntry *SE, - StencilImpl &St) { - typedef SiteHalfSpinor vobj; - typedef typename SiteHalfSpinor::scalar_object sobj; + typedef SiteHalfSpinor vobj; + typedef typename SiteHalfSpinor::scalar_object sobj; - vobj vtmp; - sobj stmp; + vobj vtmp; + sobj stmp; - GridBase *grid = St._grid; + GridBase *grid = St._grid; - const int Nsimd = grid->Nsimd(); + const int Nsimd = grid->Nsimd(); - int direction = St._directions[mu]; - int distance = St._distances[mu]; - int ptype = St._permute_type[mu]; - int sl = St._grid->_simd_layout[direction]; + int direction = St._directions[mu]; + int distance = St._distances[mu]; + int ptype = St._permute_type[mu]; + int sl = St._grid->_simd_layout[direction]; + + // Fixme X.Y.Z.T hardcode in stencil + int mmu = mu % Nd; - // Fixme X.Y.Z.T hardcode in stencil - int mmu = mu % Nd; + // assert our assumptions + assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code + assert((sl == 1) || (sl == 2)); + + std::vector icoor; - // assert our assumptions - assert((distance == 1) || (distance == -1)); // 
nearest neighbour stencil hard code - assert((sl == 1) || (sl == 2)); - - std::vector icoor; - - if ( SE->_around_the_world && Params.twists[mmu] ) { + if ( SE->_around_the_world && Params.twists[mmu] ) { - if ( sl == 2 ) { + if ( sl == 2 ) { + + std::vector vals(Nsimd); - std::vector vals(Nsimd); + extract(chi,vals); + for(int s=0;siCoorFromIindex(icoor,s); + grid->iCoorFromIindex(icoor,s); - assert((icoor[direction]==0)||(icoor[direction]==1)); + assert((icoor[direction]==0)||(icoor[direction]==1)); - int permute_lane; - if ( distance == 1) { - permute_lane = icoor[direction]?1:0; - } else { - permute_lane = icoor[direction]?0:1; + int permute_lane; + if ( distance == 1) { + permute_lane = icoor[direction]?1:0; + } else { + permute_lane = icoor[direction]?0:1; + } + + if ( permute_lane ) { + stmp(0) = vals[s](1); + stmp(1) = vals[s](0); + vals[s] = stmp; } - - if ( permute_lane ) { - stmp(0) = vals[s](1); - stmp(1) = vals[s](0); - vals[s] = stmp; - } - } - merge(vtmp,vals); + } + merge(vtmp,vals); + + } else { + vtmp(0) = chi(1); + vtmp(1) = chi(0); + } + mult(&phi(0),&U(0)(mu),&vtmp(0)); + mult(&phi(1),&U(1)(mu),&vtmp(1)); + + } else { + mult(&phi(0),&U(0)(mu),&chi(0)); + mult(&phi(1),&U(1)(mu),&chi(1)); + } + + } - } else { - vtmp(0) = chi(1); - vtmp(1) = chi(0); - } - mult(&phi(0),&U(0)(mu),&vtmp(0)); - mult(&phi(1),&U(1)(mu),&vtmp(1)); - - } else { - mult(&phi(0),&U(0)(mu),&chi(0)); - mult(&phi(1),&U(1)(mu),&chi(1)); - } + inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + { + conformable(Uds._grid,GaugeGrid); + conformable(Umu._grid,GaugeGrid); + + GaugeLinkField Utmp (GaugeGrid); + GaugeLinkField U (GaugeGrid); + GaugeLinkField Uconj(GaugeGrid); + + Lattice > coor(GaugeGrid); - } - - inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - { - - conformable(Uds._grid,GaugeGrid); - conformable(Umu._grid,GaugeGrid); - - GaugeLinkField Utmp (GaugeGrid); - GaugeLinkField U (GaugeGrid); - GaugeLinkField Uconj(GaugeGrid); - - Lattice > coor(GaugeGrid); - - - for(int mu=0;mu(Umu,mu); - Uconj = conjugate(U); + U = PeekIndex(Umu,mu); + Uconj = conjugate(U); + + // This phase could come from a simple bc 1,1,-1,1 .. + int neglink = GaugeGrid->GlobalDimensions()[mu]-1; + if ( Params.twists[mu] ) { + Uconj = where(coor==neglink,-Uconj,Uconj); + } - // This phase could come from a simple bc 1,1,-1,1 .. - int neglink = GaugeGrid->GlobalDimensions()[mu]-1; - if ( Params.twists[mu] ) { - Uconj = where(coor==neglink,-Uconj,Uconj); - } +PARALLEL_FOR_LOOP + for(auto ss=U.begin();ss(outerProduct(Btilde, A)); - PARALLEL_FOR_LOOP - for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { - link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); - } - PokeIndex(mat, link, mu); - return; - } + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { + + // DhopDir provides U or Uconj depending on coor/flavour. + GaugeLinkField link(mat._grid); + // use lorentz for flavour as hack. 
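// Editorial note (not part of the patch): the "hack" is that the flavour
// doublet occupies the slot normally used for the Lorentz index, so in the
// loop that follows tmp[ss](0,0) is the flavour-0 contribution and
// conjugate(tmp[ss](1,1)) the flavour-1 contribution, conjugated because
// that flavour couples through the conjugated links stored by DoubleStore;
// note the relative sign of the two terms differs between this 4D routine
// and the 5D variant further down.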
+ auto tmp = TraceIndex(outerProduct(Btilde, A)); +PARALLEL_FOR_LOOP + for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { + link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); + } + PokeIndex(mat, link, mu); + return; + } - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, - FermionField Ã, int mu) { - int Ls = Btilde._grid->_fdimensions[0]; + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) { + + int Ls = Btilde._grid->_fdimensions[0]; - GaugeLinkField tmp(mat._grid); - tmp = zero; - PARALLEL_FOR_LOOP - for (int ss = 0; ss < tmp._grid->oSites(); ss++) { - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); - tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - } - PokeIndex(mat, tmp, mu); - return; - } - }; + GaugeLinkField tmp(mat._grid); + tmp = zero; +PARALLEL_FOR_LOOP + for (int ss = 0; ss < tmp._grid->oSites(); ss++) { + for (int s = 0; s < Ls; s++) { + int sF = s + Ls * ss; + auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); + tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); + } + } + PokeIndex(mat, tmp, mu); + return; + } - typedef WilsonImpl WilsonImplR; // Real.. whichever prec - typedef WilsonImpl WilsonImplF; // Float - typedef WilsonImpl WilsonImplD; // Double +}; + typedef WilsonImpl WilsonImplR; // Real.. whichever prec + typedef WilsonImpl WilsonImplF; // Float + typedef WilsonImpl WilsonImplD; // Double - typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec - typedef WilsonImpl ZWilsonImplF; // Float - typedef WilsonImpl ZWilsonImplD; // Double + typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec + typedef WilsonImpl ZWilsonImplF; // Float + typedef WilsonImpl ZWilsonImplD; // Double + + typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec + typedef WilsonImpl WilsonAdjImplF; // Float + typedef WilsonImpl WilsonAdjImplD; // Double + + typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec + typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float + typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double + + typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double + + typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double + + typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec + typedef GparityWilsonImpl GparityWilsonImplF; // Float + typedef GparityWilsonImpl GparityWilsonImplD; // Double - typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec - typedef WilsonImpl WilsonAdjImplF; // Float - typedef WilsonImpl WilsonAdjImplD; // Double +}} - typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec - typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float - typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double - - typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double - - typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. 
whichever prec - typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double - - typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec - typedef GparityWilsonImpl GparityWilsonImplF; // Float - typedef GparityWilsonImpl GparityWilsonImplD; // Double -} -} #endif diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index d887e05b..4bc28bc7 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -166,7 +166,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, //////////////////////// PARALLEL_FOR_LOOP for (int sss = 0; sss < B._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu, + Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma); } @@ -277,7 +277,7 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out, + Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); } }; @@ -295,13 +295,13 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, if (dag == DaggerYes) { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, + Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } else { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, + Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 7c4fafff..8c059667 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -184,44 +184,37 @@ void WilsonFermion5D::Report(void) if ( DhopCalls > 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime - << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " - << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " - << DhopComputeTime << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " - << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - 
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; } if ( DerivCalls > 0 ) { - std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " < 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil"<::DerivInternal(StencilImpl & st, assert(sF < B._grid->oSites()); assert(sU < U._grid->oSites()); - Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu, - gamma); + Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); //////////////////////////// // spin trace outer product @@ -342,10 +334,10 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, } template -void WilsonFermion5D::DhopDeriv( GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) +void WilsonFermion5D::DhopDeriv(GaugeField &mat, + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionGrid()); conformable(A._grid,B._grid); @@ -358,9 +350,9 @@ void WilsonFermion5D::DhopDeriv( GaugeField &mat, template void WilsonFermion5D::DhopDerivEO(GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionRedBlackGrid()); conformable(GaugeRedBlackGrid(),mat._grid); @@ -376,9 +368,9 @@ void WilsonFermion5D::DhopDerivEO(GaugeField &mat, template void WilsonFermion5D::DhopDerivOE(GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionRedBlackGrid()); conformable(GaugeRedBlackGrid(),mat._grid); @@ -393,8 +385,8 @@ void WilsonFermion5D::DhopDerivOE(GaugeField &mat, template void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) { // assert((dag==DaggerNo) ||(dag==DaggerYes)); Compressor compressor(dag); @@ -412,27 +404,25 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, - out); + Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out); } #ifdef AVX512 } else if (stat.is_init() ) { int nthreads; stat.start(); - #pragma omp parallel +#pragma omp parallel { - #pragma omp master +#pragma omp master nthreads = omp_get_num_threads(); int mythread = omp_get_thread_num(); stat.enter(mythread); - #pragma omp for nowait - for(int ss=0;ssoSites();ss++) - { - int sU=ss; - int sF=LLs*sU; - Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); - } +#pragma omp for nowait + for(int ss=0;ssoSites();ss++) { + int sU=ss; + int sF=LLs*sU; + Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); + } stat.exit(mythread); } stat.accum(nthreads); @@ -442,8 +432,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, - out); + Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); } } DhopComputeTime+=usecond(); diff --git 
a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index bc7ec543..295e2997 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -34,155 +34,154 @@ Author: paboyle #include namespace Grid { +namespace QCD { - namespace QCD { + //////////////////////////////////////////////////////////////////////////////// + // This is the 4d red black case appropriate to support + // + // parity = (x+y+z+t)|2; + // generalised five dim fermions like mobius, zolotarev etc.. + // + // i.e. even even contains fifth dim hopping term. + // + // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ] + //////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// - // This is the 4d red black case appropriate to support - // - // parity = (x+y+z+t)|2; - // generalised five dim fermions like mobius, zolotarev etc.. - // - // i.e. even even contains fifth dim hopping term. - // - // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ] - //////////////////////////////////////////////////////////////////////////////// - - class WilsonFermion5DStatic { - public: - // S-direction is INNERMOST and takes no part in the parity. - static const std::vector directions; - static const std::vector displacements; - const int npoint = 8; - }; - - template - class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic - { - public: - INHERIT_IMPL_TYPES(Impl); - typedef WilsonKernels Kernels; - PmuStat stat; - - void Report(void); - void ZeroCounters(void); - double DhopCalls; - double DhopCommTime; - double DhopComputeTime; - - double DerivCalls; - double DerivCommTime; - double DerivComputeTime; - double DerivDhopComputeTime; - - /////////////////////////////////////////////////////////////// - // Implement the abstract base - /////////////////////////////////////////////////////////////// - GridBase *GaugeGrid(void) { return _FourDimGrid ;} - GridBase *GaugeRedBlackGrid(void) { return _FourDimRedBlackGrid ;} - GridBase *FermionGrid(void) { return _FiveDimGrid;} - GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} - - // full checkerboard operations; leave unimplemented as abstract for now - virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; - virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; - - // half checkerboard operations; leave unimplemented as abstract for now - virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; - virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; - - virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac - - // These can be overridden by fancy 5d chiral action - virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - virtual void DhopDerivOE(GaugeField &mat,const 
FermionField &U,const FermionField &V,int dag); - - // Implement hopping term non-hermitian hopping term; half cb or both - // Implement s-diagonal DW - void DW (const FermionField &in, FermionField &out,int dag); - void Dhop (const FermionField &in, FermionField &out,int dag); - void DhopOE(const FermionField &in, FermionField &out,int dag); - void DhopEO(const FermionField &in, FermionField &out,int dag); - - // add a DhopComm + class WilsonFermion5DStatic { + public: + // S-direction is INNERMOST and takes no part in the parity. + static const std::vector directions; + static const std::vector displacements; + const int npoint = 8; + }; + + template + class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic + { + public: + INHERIT_IMPL_TYPES(Impl); + typedef WilsonKernels Kernels; + PmuStat stat; + + void Report(void); + void ZeroCounters(void); + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + + double DerivCalls; + double DerivCommTime; + double DerivComputeTime; + double DerivDhopComputeTime; + + /////////////////////////////////////////////////////////////// + // Implement the abstract base + /////////////////////////////////////////////////////////////// + GridBase *GaugeGrid(void) { return _FourDimGrid ;} + GridBase *GaugeRedBlackGrid(void) { return _FourDimRedBlackGrid ;} + GridBase *FermionGrid(void) { return _FiveDimGrid;} + GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} + + // full checkerboard operations; leave unimplemented as abstract for now + virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; + virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; + + // half checkerboard operations; leave unimplemented as abstract for now + virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; + + virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + + // These can be overridden by fancy 5d chiral action + virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + + // Implement hopping term non-hermitian hopping term; half cb or both + // Implement s-diagonal DW + void DW (const FermionField &in, FermionField &out,int dag); + void Dhop (const FermionField &in, FermionField &out,int dag); + void DhopOE(const FermionField &in, FermionField &out,int dag); + void DhopEO(const FermionField &in, FermionField &out,int dag); + + // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. 
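// Editorial note (not part of the patch): DhopDir, declared next, exchanges
// the halo for a single (direction, displacement) pair per call, so probing
// all eight stencil points this way repeats the communication each time; the
// Stencil's HaloGather/CommunicateBegin/CommunicateComplete path earlier in
// this series already gathers every point into one unified buffer before a
// single communication round, which is the shape a dedicated DhopComm
// interface would take.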
- void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); - - /////////////////////////////////////////////////////////////// - // New methods added - /////////////////////////////////////////////////////////////// - void DerivInternal(StencilImpl & st, - DoubledGaugeField & U, - GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag); - - void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); - - // Constructors - WilsonFermion5D(GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - GridRedBlackCartesian &FourDimRedBlackGrid, - double _M5,const ImplParams &p= ImplParams()); - - // Constructors - /* + void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); + + /////////////////////////////////////////////////////////////// + // New methods added + /////////////////////////////////////////////////////////////// + void DerivInternal(StencilImpl & st, + DoubledGaugeField & U, + GaugeField &mat, + const FermionField &A, + const FermionField &B, + int dag); + + void DhopInternal(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, + int dag); + + // Constructors + WilsonFermion5D(GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _M5,const ImplParams &p= ImplParams()); + + // Constructors + /* WilsonFermion5D(int simd, - GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - double _M5,const ImplParams &p= ImplParams()); - */ + GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + double _M5,const ImplParams &p= ImplParams()); + */ + + // DoubleStore + void ImportGauge(const GaugeField &_Umu); + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + public: + + // Add these to the support from Wilson + GridBase *_FourDimGrid; + GridBase *_FourDimRedBlackGrid; + GridBase *_FiveDimGrid; + GridBase *_FiveDimRedBlackGrid; + + double M5; + int Ls; + + //Defines the stencils for even and odd + StencilImpl Stencil; + StencilImpl StencilEven; + StencilImpl StencilOdd; + + // Copy of the gauge field , with even and odd subsets + DoubledGaugeField Umu; + DoubledGaugeField UmuEven; + DoubledGaugeField UmuOdd; + + LebesgueOrder Lebesgue; + LebesgueOrder LebesgueEvenOdd; + + // Comms buffer + std::vector > comm_buf; + + }; - // DoubleStore - void ImportGauge(const GaugeField &_Umu); - - /////////////////////////////////////////////////////////////// - // Data members require to support the functionality - /////////////////////////////////////////////////////////////// - public: - - // Add these to the support from Wilson - GridBase *_FourDimGrid; - GridBase *_FourDimRedBlackGrid; - GridBase *_FiveDimGrid; - GridBase *_FiveDimRedBlackGrid; - - double M5; - int Ls; - - //Defines the stencils for even and odd - StencilImpl Stencil; - StencilImpl StencilEven; - StencilImpl StencilOdd; - - // Copy of the gauge field , with even and odd subsets - DoubledGaugeField Umu; - DoubledGaugeField UmuEven; - DoubledGaugeField UmuOdd; - - LebesgueOrder 
Lebesgue; - LebesgueOrder LebesgueEvenOdd; - - // Comms buffer - std::vector > comm_buf; - - }; - } -} +}} #endif diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 58e62cc8..52ee8704 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -43,10 +43,9 @@ WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// template -void WilsonKernels::DiracOptGenericDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out) { +void WilsonKernels::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -220,10 +219,9 @@ void WilsonKernels::DiracOptGenericDhopSiteDag( // Need controls to do interior, exterior, or both template -void WilsonKernels::DiracOptGenericDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out) { +void WilsonKernels::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -396,10 +394,9 @@ void WilsonKernels::DiracOptGenericDhopSite( }; template -void WilsonKernels::DiracOptDhopDir( - StencilImpl &st, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out, int dir, int gamma) { +void WilsonKernels::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out, int dir, int gamma) { + SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteSpinor result; diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 66770c3c..669ee4be 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -32,175 +32,132 @@ directory #define GRID_QCD_DHOP_H namespace Grid { +namespace QCD { - namespace QCD { - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Helper routines that implement Wilson stencil for a single site. - // Common to both the WilsonFermion and WilsonFermion5D - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - class WilsonKernelsStatic { - public: - // S-direction is INNERMOST and takes no part in the parity. - static int AsmOpt; // these are a temporary hack - static int HandOpt; // these are a temporary hack - }; - - template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { - public: - - INHERIT_IMPL_TYPES(Impl); - typedef FermionOperator Base; - - public: - - template - typename std::enable_if::type - DiracOptDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Helper routines that implement Wilson stencil for a single site. 
+ // Common to both the WilsonFermion and WilsonFermion5D + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class WilsonKernelsStatic { + public: + // S-direction is INNERMOST and takes no part in the parity. + static int AsmOpt; // these are a temporary hack + static int HandOpt; // these are a temporary hack +}; + +template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { + public: + + INHERIT_IMPL_TYPES(Impl); + typedef FermionOperator Base; + +public: + + template + typename std::enable_if::type + DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { #ifdef AVX512 - if (AsmOpt) { - WilsonKernels::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns, - in, out); - - } else { + if (AsmOpt) { + WilsonKernels::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { #else - { + { #endif - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if (HandOpt) - WilsonKernels::DiracOptHandDhopSite(st, lo, U, buf, sF, sU, - in, out); - else - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, - in, out); - sF++; - } - sU++; - } - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if (HandOpt) + WilsonKernels::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); + else + WilsonKernels::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out); + sF++; } - - template - typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type - DiracOptDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, - out); - sF++; - } - sU++; - } - } - - template - typename std::enable_if::type - DiracOptDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { -#ifdef AVX512 - if (AsmOpt) { - WilsonKernels::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls, - Ns, in, out); - } else { -#else - { -#endif - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if (HandOpt) - WilsonKernels::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU, - in, out); - else - WilsonKernels::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, - sU, in, out); - sF++; - } - sU++; - } - } - } - - template - typename std::enable_if< - (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, - void>::type - DiracOptDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU, - in, out); - sF++; - } - sU++; - } - } - - void DiracOptDhopDir( - StencilImpl &st, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, - int gamma); - - private: - // Specialised variants - void DiracOptGenericDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField 
&out); - - void DiracOptGenericDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptAsmDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out); - - void DiracOptAsmDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out); - - void DiracOptHandDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptHandDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - public: - WilsonKernels(const ImplParams &p = ImplParams()); - }; - + sU++; } } + } + + template + typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type + DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); + sF++; + } + sU++; + } + } + + template + typename std::enable_if::type + DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { +#ifdef AVX512 + if (AsmOpt) { + WilsonKernels::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { +#else + { +#endif + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if (HandOpt) + WilsonKernels::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else + WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + } + } + template + typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type + DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + } + void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma); + +private: + // Specialised variants + void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + + void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + + void 
DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + +public: + + WilsonKernels(const ImplParams &p = ImplParams()); + +}; + +}} #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 7857e89a..74862400 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -33,31 +33,27 @@ Author: paboyle namespace Grid { - namespace QCD { +namespace QCD { - /////////////////////////////////////////////////////////// - // Default to no assembler implementation - /////////////////////////////////////////////////////////// - template - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - template - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - +/////////////////////////////////////////////////////////// +// Default to no assembler implementation +/////////////////////////////////////////////////////////// +template void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} +template void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} #if defined(AVX512) - /////////////////////////////////////////////////////////// // If we are AVX512 specialise the single precision routine /////////////////////////////////////////////////////////// @@ -65,16 +61,16 @@ namespace Grid { #include #include - static Vector signs; +static Vector signs; - int setupSigns(void ){ - Vector bother(2); - signs = bother; - vrsign(signs[0]); - visign(signs[1]); - return 1; - } - static int signInit = setupSigns(); + int setupSigns(void ){ + Vector bother(2); + signs = bother; + vrsign(signs[0]); + visign(signs[1]); + return 1; + } + static int signInit = setupSigns(); #define label(A) ilabel(A) #define ilabel(A) ".globl\n" #A ":\n" @@ -84,17 +80,15 @@ namespace Grid { #define FX(A) WILSONASM_ ##A #undef KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #define KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const 
FermionField &in, FermionField &out) #include #undef VMOVIDUP @@ -109,31 +103,26 @@ namespace Grid { #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #undef KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #define KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #endif - #define INSTANTIATE_ASM(A)\ -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ - commVector &buf,\ +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ - commVector &buf,\ + \ +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ - INSTANTIATE_ASM(WilsonImplF); INSTANTIATE_ASM(WilsonImplD); INSTANTIATE_ASM(ZWilsonImplF); @@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF); INSTANTIATE_ASM(DomainWallVec5dImplD); INSTANTIATE_ASM(ZDomainWallVec5dImplF); INSTANTIATE_ASM(ZDomainWallVec5dImplD); - } -} + +}} diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 9d7eac23..f5900832 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -311,10 +311,9 @@ namespace Grid { namespace QCD { - template - void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int ss,int sU,const FermionField &in, FermionField &out) +template void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -554,10 +553,9 @@ namespace QCD { } } - template - void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int ss,int sU,const FermionField &in, FermionField &out) +template +void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) { // std::cout << "Hand op Dhop "< -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int 
sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } @@ -840,12 +835,10 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, // Need Nc=3 though // #define INSTANTIATE_THEM(A) \ -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ - commVector &buf,\ - int ss,int sU,const FermionField &in, FermionField &out);\ -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ - commVector &buf,\ - int ss,int sU,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc index 0e35a414..1b71b8a5 100644 --- a/tests/Test_stencil.cc +++ b/tests/Test_stencil.cc @@ -116,7 +116,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) Check._odata[i] = Foo._odata[SE->_offset]; else - Check._odata[i] = myStencil.comm_buf[SE->_offset]; + Check._odata[i] = myStencil.CommBuf()[SE->_offset]; } Real nrmC = norm2(Check); @@ -207,7 +207,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) OCheck._odata[i] = EFoo._odata[SE->_offset]; else - OCheck._odata[i] = EStencil.comm_buf[SE->_offset]; + OCheck._odata[i] = EStencil.CommBuf()[SE->_offset]; } for(int i=0;ioSites();i++){ int permute_type; @@ -220,7 +220,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) ECheck._odata[i] = OFoo._odata[SE->_offset]; else - ECheck._odata[i] = OStencil.comm_buf[SE->_offset]; + ECheck._odata[i] = OStencil.CommBuf()[SE->_offset]; } setCheckerboard(Check,ECheck); From ea25a4d9ac9d07f223a0c362d9ed7ec5fff998a8 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Sun, 23 Oct 2016 06:10:05 +0100 Subject: [PATCH 12/27] Works --- lib/Stencil.h | 60 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 6b942539..f5fa1f8b 100644 --- a/lib/Stencil.h +++ 
b/lib/Stencil.h @@ -289,11 +289,17 @@ PARALLEL_FOR_LOOP /////////////////////////////////////////////////////////// // Unified Comms buffers for all directions /////////////////////////////////////////////////////////// - std::vector > u_simd_send_buf; - std::vector > u_simd_recv_buf; + // Vectors that live on the symmetric heap in case of SHMEM + std::vector > u_simd_send_buf_hide; + std::vector > u_simd_recv_buf_hide; commVector u_send_buf; commVector u_recv_buf_hide; + // These are used; either SHM objects or refs to the above symmetric heap vectors + // depending on comms target cobj* u_recv_buf_p; + cobj* u_send_buf_p; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; int u_comm_offset; int _unified_buffer_size; @@ -427,32 +433,42 @@ PARALLEL_FOR_LOOP } } } - u_send_buf.resize(_unified_buffer_size); ///////////////////////////////////////////////////////////////////////////////// // Try to allocate for receiving in a shared memory region, fall back to buffer ///////////////////////////////////////////////////////////////////////////////// - if( ShmDirectCopy ) { + const int Nsimd = grid->Nsimd(); - u_recv_buf_p=(cobj *)_grid->ShmBufferSelf(); - if ( u_recv_buf_p == NULL ) { - u_recv_buf_hide.resize(_unified_buffer_size); - u_recv_buf_p=&u_recv_buf_hide[0]; + uint8_t *shm_ptr = (uint8_t *)_grid->ShmBufferSelf(); + + u_simd_send_buf.resize(Nsimd); + u_simd_recv_buf.resize(Nsimd); + + u_send_buf.resize(_unified_buffer_size); + + if( ShmDirectCopy && shm_ptr != NULL ) { + + u_recv_buf_p=(cobj *)shm_ptr; shm_ptr+= _unified_buffer_size*sizeof(cobj); + for(int l=0;l(_unified_buffer_size)); + u_simd_recv_buf_hide.resize(Nsimd,commVector(_unified_buffer_size)); + u_recv_buf_p=&u_recv_buf_hide[0]; + for(int l=0;lNsimd(); - u_simd_send_buf.resize(Nsimd); - u_simd_recv_buf.resize(Nsimd); - for(int l=0;lShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - + + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + auto shm_or_rp = rp; + if (ShmDirectCopy) { + scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(xmit_to_rank,sp); + if ( shm!=NULL) { + shm_or_rp = shm; + } + } - rpointers[i] = rp; + rpointers[i] = shm_or_rp; } else { @@ -942,7 +967,6 @@ PARALLEL_FOR_LOOP } } - assert(0); AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); u_comm_offset +=buffer_size; From b6a65059a21f00c2d9f2a01f10f550996d6acb02 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Mon, 24 Oct 2016 17:30:43 +0100 Subject: [PATCH 13/27] Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions --- benchmarks/Benchmark_dwf.cc | 21 +- lib/Init.cc | 14 +- lib/Init.h | 2 + lib/Log.cc | 5 + lib/Log.h | 59 ++-- lib/Makefile.am | 4 + lib/Stencil.h | 73 ++--- lib/communicator/Communicator_base.cc | 132 +++++++++ lib/communicator/Communicator_base.h | 313 ++++++++++---------- lib/communicator/Communicator_mpi.cc | 41 +-- lib/communicator/Communicator_mpi3.cc | 388 ++++++++++++++----------- lib/communicator/Communicator_none.cc | 55 ++-- lib/communicator/Communicator_shmem.cc | 57 ++-- 13 files changed, 706 insertions(+), 458 deletions(-) create mode 100644 lib/communicator/Communicator_base.cc diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index f75f0385..c9e2fa22 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -156,6 +156,7 @@ int main (int argc, char ** argv) std::cout<1.0e-5) { + setCheckerboard(ssrc,ssrc_o); + 
setCheckerboard(ssrc,ssrc_e); + std::cout<< ssrc << std::endl; + } } @@ -306,7 +314,7 @@ int main (int argc, char ** argv) std::cout< & vec){ ///////////////////////////////////////////////////////// // ///////////////////////////////////////////////////////// +static int Grid_is_initialised = 0; + + void Grid_init(int *argc,char ***argv) { + GridLogger::StopWatch.Start(); + CartesianCommunicator::Init(argc,argv); // Parse command line args. - GridLogger::StopWatch.Start(); - std::string arg; std::vector logstreams; std::string defaultLog("Error,Warning,Message,Performance"); @@ -216,11 +219,14 @@ void Grid_init(int *argc,char ***argv) if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ LebesgueOrder::UseLebesgueOrder=1; } - if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); } + if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){ + GridLogTimestamp(1); + } + GridParseLayout(*argv,*argc, Grid_default_latt, Grid_default_mpi); @@ -274,6 +280,8 @@ void Grid_init(int *argc,char ***argv) std::cout << "GNU General Public License for more details."< &GridDefaultMpi(void); const int &GridThreads(void) ; void GridSetThreads(int t) ; + void GridLogTimestamp(int); // Common parsing chores std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option); diff --git a/lib/Log.cc b/lib/Log.cc index a55a1c9e..d4ac42ee 100644 --- a/lib/Log.cc +++ b/lib/Log.cc @@ -34,8 +34,13 @@ directory namespace Grid { GridStopWatch Logger::StopWatch; +int Logger::timestamp; std::ostream Logger::devnull(0); +void GridLogTimestamp(int on){ + Logger::Timestamp(on); +} + Colours GridLogColours(0); GridLogger GridLogError(1, "Error", GridLogColours, "RED"); GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW"); diff --git a/lib/Log.h b/lib/Log.h index 156f52ee..dd3fe927 100644 --- a/lib/Log.h +++ b/lib/Log.h @@ -37,10 +37,11 @@ #include #endif - namespace Grid { +namespace Grid { +////////////////////////////////////////////////////////////////////////////////////////////////// // Dress the output; use std::chrono for time stamping via the StopWatch class -int Rank(void); // used for early stage debug before library init +////////////////////////////////////////////////////////////////////////////////////////////////// class Colours{ @@ -55,7 +56,6 @@ public: void Active(bool activate){ is_active=activate; - if (is_active){ colour["BLACK"] ="\033[30m"; colour["RED"] ="\033[31m"; @@ -66,21 +66,18 @@ public: colour["CYAN"] ="\033[36m"; colour["WHITE"] ="\033[37m"; colour["NORMAL"] ="\033[0;39m"; - } else { - colour["BLACK"] =""; - colour["RED"] =""; - colour["GREEN"] =""; - colour["YELLOW"]=""; - colour["BLUE"] =""; - colour["PURPLE"]=""; - colour["CYAN"] =""; - colour["WHITE"] =""; - colour["NORMAL"]=""; - } - - -}; - + } else { + colour["BLACK"] =""; + colour["RED"] =""; + colour["GREEN"] =""; + colour["YELLOW"]=""; + colour["BLUE"] =""; + colour["PURPLE"]=""; + colour["CYAN"] =""; + colour["WHITE"] =""; + colour["NORMAL"]=""; + } + }; }; @@ -88,6 +85,7 @@ class Logger { protected: Colours &Painter; int active; + static int timestamp; std::string name, topName; std::string COLOUR; @@ -99,25 +97,28 @@ public: std::string evidence() {return Painter.colour["YELLOW"];} std::string colour() {return Painter.colour[COLOUR];} - Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) - : active(on), - name(nm), - topName(topNm), 
- Painter(col_class), - COLOUR(col){} ; + Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on), + name(nm), + topName(topNm), + Painter(col_class), + COLOUR(col) {} ; void Active(int on) {active = on;}; int isActive(void) {return active;}; + static void Timestamp(int on) {timestamp = on;}; friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { - StopWatch.Stop(); - GridTime now = StopWatch.Elapsed(); - StopWatch.Start(); stream << log.background()<< log.topName << log.background()<< " : "; stream << log.colour() < > u_simd_send_buf_hide; - std::vector > u_simd_recv_buf_hide; - commVector u_send_buf; - commVector u_recv_buf_hide; + // std::vector > u_simd_send_buf_hide; + // std::vector > u_simd_recv_buf_hide; + // commVector u_send_buf_hide; + // commVector u_recv_buf_hide; + // These are used; either SHM objects or refs to the above symmetric heap vectors // depending on comms target cobj* u_recv_buf_p; @@ -439,36 +440,19 @@ PARALLEL_FOR_LOOP ///////////////////////////////////////////////////////////////////////////////// const int Nsimd = grid->Nsimd(); - uint8_t *shm_ptr = (uint8_t *)_grid->ShmBufferSelf(); + _grid->ShmBufferFreeAll(); u_simd_send_buf.resize(Nsimd); u_simd_recv_buf.resize(Nsimd); - u_send_buf.resize(_unified_buffer_size); - - if( ShmDirectCopy && shm_ptr != NULL ) { - - u_recv_buf_p=(cobj *)shm_ptr; shm_ptr+= _unified_buffer_size*sizeof(cobj); - for(int l=0;l(_unified_buffer_size)); - u_simd_recv_buf_hide.resize(Nsimd,commVector(_unified_buffer_size)); - - u_recv_buf_p=&u_recv_buf_hide[0]; - for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); + u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); } PrecomputeByteOffsets(); - } void Local (int point, int dimension,int shiftpm,int cbmask) @@ -698,6 +682,7 @@ PARALLEL_FOR_LOOP calls++; Mergers.resize(0); Packets.resize(0); + _grid->StencilBarrier(); HaloGather(source,compress); this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); @@ -836,19 +821,17 @@ PARALLEL_FOR_LOOP // try the direct copy if possible ///////////////////////////////////////////////////////// - cobj *u_send_buf_p = &u_send_buf[0]; - if (ShmDirectCopy) { - cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank); - if ( shm!=NULL) { - u_send_buf_p = shm; - } + + cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); + if ( (ShmDirectCopy==0)||send_buf==NULL ) { + send_buf = u_send_buf_p; } t_data-=usecond(); - Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so); face_idx++; + Gather_plane_simple_table (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; t_data+=usecond(); - AddPacket((void *)&u_send_buf_p[u_comm_offset], + AddPacket((void *)&send_buf[u_comm_offset], (void *)&u_recv_buf_p[u_comm_offset], xmit_to_rank, recv_from_rank, @@ -947,18 +930,16 @@ PARALLEL_FOR_LOOP _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - - auto shm_or_rp = rp; - if (ShmDirectCopy) { - scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(xmit_to_rank,sp); - if ( shm!=NULL) { - shm_or_rp = shm; - } - } + scalar_object *shm = (scalar_object *)
_grid->ShmBufferTranslate(recv_from_rank,sp); + if ((ShmDirectCopy==0)||(shm==NULL)) { + shm = rp; + } - rpointers[i] = shm_or_rp; + // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node + // assuming above pointer flip + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + rpointers[i] = shm; } else { diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc new file mode 100644 index 00000000..1272b6a2 --- /dev/null +++ b/lib/communicator/Communicator_base.cc @@ -0,0 +1,132 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/communicator/Communicator_base.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include "Grid.h" +namespace Grid { + +/////////////////////////////////////////////////////////////// +// Info that is setup once and independent of cartesian layout +/////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +int CartesianCommunicator::WorldRank; +int CartesianCommunicator::WorldSize; +int CartesianCommunicator::Slave; +void * CartesianCommunicator::ShmCommBuf; + +///////////////////////////////// +// Alloc, free shmem region +///////////////////////////////// +void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){ + // bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes + void *ptr = (void *)heap_top; + heap_top += bytes; + heap_bytes+= bytes; + assert(heap_bytes < MAX_MPI_SHM_BYTES); + return ptr; +} +void CartesianCommunicator::ShmBufferFreeAll(void) { + heap_top =(size_t)ShmBufferSelf(); + heap_bytes=0; +} + +///////////////////////////////// +// Grid information queries +///////////////////////////////// +int CartesianCommunicator::IsBoss(void) { return _processor==0; }; +int CartesianCommunicator::BossRank(void) { return 0; }; +int CartesianCommunicator::ThisRank(void) { return _processor; }; +const std::vector & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; +const std::vector & CartesianCommunicator::ProcessorGrid(void) { return _processors; }; +int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; }; + +//////////////////////////////////////////////////////////////////////////////// +// very VERY rarely (Log, serial RNG) we need world without a grid +//////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::RankWorld(void) {
return WorldRank; }; +int CartesianCommunicator::Ranks (void) { return WorldSize; }; +int CartesianCommunicator::Nodes (void) { return GroupSize; }; +int CartesianCommunicator::Cores (void) { return ShmSize; }; +int CartesianCommunicator::NodeRank (void) { return GroupRank; }; +int CartesianCommunicator::CoreRank (void) { return ShmRank; }; + +void CartesianCommunicator::GlobalSum(ComplexF &c) +{ + GlobalSumVector((float *)&c,2); +} +void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) +{ + GlobalSumVector((float *)c,2*N); +} +void CartesianCommunicator::GlobalSum(ComplexD &c) +{ + GlobalSumVector((double *)&c,2); +} +void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) +{ + GlobalSumVector((double *)c,2*N); +} + +#ifndef GRID_COMMS_MPI3 + +void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes) +{ + SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); +} +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall) +{ + SendToRecvFromComplete(waitall); +} +void CartesianCommunicator::StencilBarrier(void){}; + +commVector CartesianCommunicator::ShmBufStorageVector; + +void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; } +void *CartesianCommunicator::ShmBuffer(int rank) { + if (rank != ShmRank ) return NULL; + else return ShmCommBuf; +} +void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { + if (rank != ShmRank ) return NULL; + else return local_p; +} +void CartesianCommunicator::ShmInitGeneric(void){ + ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); + ShmCommBuf=(void *)&ShmBufStorageVector[0]; +} + +#endif + +} + diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 4139f72a..576c749e 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -40,169 +40,188 @@ Author: Peter Boyle #ifdef GRID_COMMS_SHMEM #include #endif + namespace Grid { + class CartesianCommunicator { public: + // 65536 ranks per node adequate for now + // 128MB shared memory for comms enough for 48^4 local vol comms + // Give external control (command line override?) of this + + static const int MAXLOG2RANKSPERNODE = 16; + static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024; + // Communicator should know nothing of the physics grid, only processor grid. - - int _Nprocessors; // How many in all - std::vector _processors; // Which dimensions get relayed out over processors lanes. - int _processor; // linear processor rank - std::vector _processor_coor; // linear processor coordinate - unsigned long _ndimension; + int _Nprocessors; // How many in all + std::vector _processors; // Which dimensions get relayed out over processors lanes.
+ int _processor; // linear processor rank + std::vector _processor_coor; // linear processor coordinate + unsigned long _ndimension; -#ifdef GRID_COMMS_MPI - MPI_Comm communicator; - typedef MPI_Request CommsRequest_t; -#elif GRID_COMMS_MPI3 - int shm_mode; - - MPI_Comm communicator; - typedef MPI_Request CommsRequest_t; - - const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now - const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enought for 48^4 local vol comms - - std::vector WorldDims; - std::vector GroupDims; - std::vector ShmDims; - - std::vector GroupCoor; - std::vector ShmCoor; - std::vector WorldCoor; - - static std::vector GroupRanks; - static std::vector MyGroup; - static int ShmSetup; - static MPI_Win ShmWindow; - static MPI_Comm ShmComm; - - void * ShmCommBuf; - std::vector ShmCommBufs; - - int WorldRank; - int WorldSize; - - static int ShmRank; - static int ShmSize; - static int GroupSize; - static int GroupRank; - - std::vector LexicographicToWorldRank; +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) + MPI_Comm communicator; + static MPI_Comm communicator_world; + typedef MPI_Request CommsRequest_t; #else - typedef int CommsRequest_t; + typedef int CommsRequest_t; #endif - static void Init(int *argc, char ***argv); + //////////////////////////////////////////////////////////////////// + // Helper functionality for SHM Windows common to all other impls + //////////////////////////////////////////////////////////////////// + // Longer term; drop this in favour of a master / slave model with + // cartesian communicator on a subset of ranks, slave ranks controlled + // by group leader with data xfer via shared memory + //////////////////////////////////////////////////////////////////// +#ifdef GRID_COMMS_MPI3 + std::vector WorldDims; + std::vector GroupDims; + std::vector ShmDims; + + std::vector GroupCoor; + std::vector ShmCoor; + std::vector WorldCoor; + + static std::vector GroupRanks; + static std::vector MyGroup; + static int ShmSetup; + static MPI_Win ShmWindow; + static MPI_Comm ShmComm; + + std::vector LexicographicToWorldRank; + + static std::vector ShmCommBufs; +#else + static void ShmInitGeneric(void); + static commVector ShmBufStorageVector; +#endif + static void * ShmCommBuf; + size_t heap_top; + size_t heap_bytes; + void *ShmBufferSelf(void); + void *ShmBuffer(int rank); + void *ShmBufferTranslate(int rank,void * local_p); + void *ShmBufferMalloc(size_t bytes); + void ShmBufferFreeAll(void); + + //////////////////////////////////////////////// + // Must call in Grid startup + //////////////////////////////////////////////// + static void Init(int *argc, char ***argv); + + //////////////////////////////////////////////// + // Constructor of any given grid + //////////////////////////////////////////////// + CartesianCommunicator(const std::vector &pdimensions_in); + + //////////////////////////////////////////////////////////////////////////////////////// + // Wraps MPI_Cart routines, or implements equivalent on other impls + //////////////////////////////////////////////////////////////////////////////////////// + void ShiftedRanks(int dim,int shift,int & source, int & dest); + int RankFromProcessorCoor(std::vector &coor); + void ProcessorCoorFromRank(int rank,std::vector &coor); + + ///////////////////////////////// + // Grid information and queries + ///////////////////////////////// + static int ShmRank; + static int ShmSize; + static int GroupSize; + static int GroupRank; + static int
WorldRank; + static int WorldSize; + static int Slave; + + int IsBoss(void) ; + int BossRank(void) ; + int ThisRank(void) ; + const std::vector & ThisProcessorCoor(void) ; + const std::vector & ProcessorGrid(void) ; + int ProcessorCount(void) ; + static int Ranks (void); + static int Nodes (void); + static int Cores (void); + static int NodeRank (void); + static int CoreRank (void); - // Constructor - CartesianCommunicator(const std::vector &pdimensions_in); + //////////////////////////////////////////////////////////////////////////////// + // very VERY rarely (Log, serial RNG) we need world without a grid + //////////////////////////////////////////////////////////////////////////////// + static int RankWorld(void) ; + static void BroadcastWorld(int root,void* data, int bytes); + + //////////////////////////////////////////////////////////// + // Reduction + //////////////////////////////////////////////////////////// + void GlobalSum(RealF &); + void GlobalSumVector(RealF *,int N); + void GlobalSum(RealD &); + void GlobalSumVector(RealD *,int N); + void GlobalSum(uint32_t &); + void GlobalSum(uint64_t &); + void GlobalSum(ComplexF &c); + void GlobalSumVector(ComplexF *c,int N); + void GlobalSum(ComplexD &c); + void GlobalSumVector(ComplexD *c,int N); + + template void GlobalSum(obj &o){ + typedef typename obj::scalar_type scalar_type; + int words = sizeof(obj)/sizeof(scalar_type); + scalar_type * ptr = (scalar_type *)& o; + GlobalSumVector(ptr,words); + } + + //////////////////////////////////////////////////////////// + // Face exchange, buffer swap in translational invariant way + //////////////////////////////////////////////////////////// + void SendToRecvFrom(void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void SendRecvPacket(void *xmit, + void *recv, + int xmit_to_rank, + int recv_from_rank, + int bytes); + + void SendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void SendToRecvFromComplete(std::vector &waitall); - // Wraps MPI_Cart routines - void ShiftedRanks(int dim,int shift,int & source, int & dest); - int RankFromProcessorCoor(std::vector &coor); - void ProcessorCoorFromRank(int rank,std::vector &coor); + void StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void StencilSendToRecvFromComplete(std::vector &waitall); + void StencilBarrier(void); - // Helper function for SHM Windows in MPI3 - void *ShmBufferSelf(void); - void *ShmBuffer(int rank); - - ///////////////////////////////// - // Grid information queries - ///////////////////////////////// - int IsBoss(void) { return _processor==0; }; - int BossRank(void) { return 0; }; - int ThisRank(void) { return _processor; }; - const std::vector & ThisProcessorCoor(void) { return _processor_coor; }; - const std::vector & ProcessorGrid(void) { return _processors; }; - int ProcessorCount(void) { return _Nprocessors; }; - - //////////////////////////////////////////////////////////// - // Reduction - //////////////////////////////////////////////////////////// - void GlobalSum(RealF &); - void GlobalSumVector(RealF *,int N); - - void GlobalSum(RealD &); - void GlobalSumVector(RealD *,int N); - - void GlobalSum(uint32_t &); - void GlobalSum(uint64_t &); - - void GlobalSum(ComplexF &c) - { - GlobalSumVector((float *)&c,2); - } - void GlobalSumVector(ComplexF *c,int N) - { - GlobalSumVector((float *)c,2*N); - } - - void GlobalSum(ComplexD 
&c) - { - GlobalSumVector((double *)&c,2); - } - void GlobalSumVector(ComplexD *c,int N) - { - GlobalSumVector((double *)c,2*N); - } - - template void GlobalSum(obj &o){ - typedef typename obj::scalar_type scalar_type; - int words = sizeof(obj)/sizeof(scalar_type); - scalar_type * ptr = (scalar_type *)& o; - GlobalSumVector(ptr,words); - } - //////////////////////////////////////////////////////////// - // Face exchange, buffer swap in translational invariant way - //////////////////////////////////////////////////////////// - void SendToRecvFrom(void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - - void SendRecvPacket(void *xmit, - void *recv, - int xmit_to_rank, - int recv_from_rank, - int bytes); - - void SendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - void SendToRecvFromComplete(std::vector &waitall); - void StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - void StencilSendToRecvFromComplete(std::vector &waitall) - { - SendToRecvFromComplete(waitall); - } - - //////////////////////////////////////////////////////////// - // Barrier - //////////////////////////////////////////////////////////// - void Barrier(void); - - //////////////////////////////////////////////////////////// - // Broadcast a buffer and composite larger - //////////////////////////////////////////////////////////// - void Broadcast(int root,void* data, int bytes); - template void Broadcast(int root,obj &data) + //////////////////////////////////////////////////////////// + // Barrier + //////////////////////////////////////////////////////////// + void Barrier(void); + + //////////////////////////////////////////////////////////// + // Broadcast a buffer and composite larger + //////////////////////////////////////////////////////////// + void Broadcast(int root,void* data, int bytes); + + template void Broadcast(int root,obj &data) { Broadcast(root,(void *)&data,sizeof(data)); }; - static void BroadcastWorld(int root,void* data, int bytes); - }; } diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 4291b319..a638eebb 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -30,19 +30,28 @@ Author: Peter Boyle namespace Grid { - // Should error check all MPI calls. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +MPI_Comm CartesianCommunicator::communicator_world; + +// Should error check all MPI calls. 
void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { MPI_Init(argc,argv); } -} - -int Rank(void) { - int pe; - MPI_Comm_rank(MPI_COMM_WORLD,&pe); - return pe; + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + MPI_Comm_rank(communicator_world,&WorldRank); + MPI_Comm_size(communicator_world,&WorldSize); + ShmRank=0; + ShmSize=1; + GroupRank=WorldRank; + GroupSize=WorldSize; + Slave =0; + ShmInitGeneric(); } CartesianCommunicator::CartesianCommunicator(const std::vector &processors) @@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) _processors = processors; _processor_coor.resize(_ndimension); - MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator); + MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); MPI_Comm_rank(communicator,&_processor); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); @@ -67,15 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) assert(Size==_Nprocessors); } -void *CartesianCommunicator::ShmBufferSelf(void) -{ - return NULL; -} -void *CartesianCommunicator::ShmBuffer(int rank) -{ - return NULL; -} - void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); @@ -194,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) communicator); assert(ierr==0); } - + /////////////////////////////////////////////////////// + // Should only be used prior to Grid Init finished. + // Check for this? + /////////////////////////////////////////////////////// void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { int ierr= MPI_Bcast(data, bytes, MPI_BYTE, root, - MPI_COMM_WORLD); + communicator_world); assert(ierr==0); } diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index f5bbdbda..00b0ca11 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -33,26 +32,197 @@ Author: Peter Boyle namespace Grid { +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmSetup = 0; -// Global used by Init and nowhere else. How to hide? -int Rank(void) { - int pe; - MPI_Comm_rank(MPI_COMM_WORLD,&pe); - return pe; +MPI_Comm CartesianCommunicator::communicator_world; +MPI_Comm CartesianCommunicator::ShmComm; +MPI_Win CartesianCommunicator::ShmWindow; + +std::vector CartesianCommunicator::GroupRanks; +std::vector CartesianCommunicator::MyGroup; +std::vector CartesianCommunicator::ShmCommBufs; + +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return ShmCommBufs[ShmRank]; } - // Should error check all MPI calls. 
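The translation trick used throughout this communicator: every rank in the SHM group maps the same shared window, generally at a different virtual base address, so a pointer valid on one rank is converted into a peer's address space by carrying the byte offset across. A minimal standalone sketch of that rebasing, with illustrative names rather than the Grid API:

    // my_base and peer_base are the two ranks' mappings of the same shared window.
    void *translate(void *local_p, void *my_base, void *peer_base) {
      uint64_t offset = (uint64_t)local_p - (uint64_t)my_base;
      return (void *)((uint64_t)peer_base + offset);
    }

This is exactly the arithmetic ShmBufferTranslate performs below with ShmCommBufs[ShmRank] and ShmCommBufs[gpeer].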
+void *CartesianCommunicator::ShmBuffer(int rank) +{ + int gpeer = GroupRanks[rank]; + if (gpeer == MPI_UNDEFINED){ + return NULL; + } else { + return ShmCommBufs[gpeer]; + } +} +void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) +{ + int gpeer = GroupRanks[rank]; + if (gpeer == MPI_UNDEFINED){ + return NULL; + } else { + uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; + uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset; + return (void *) remote; + } +} + void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { MPI_Init(argc,argv); } -} - //////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Want to implement some magic ... Group sub-cubes into those on same node - // - //////////////////////////////////////////////////////////////////////////////////////////////////////////// + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + MPI_Comm_rank(communicator_world,&WorldRank); + MPI_Comm_size(communicator_world,&WorldSize); + + ///////////////////////////////////////////////////////////////////// + // Split into groups that can share memory + ///////////////////////////////////////////////////////////////////// + MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); + MPI_Comm_rank(ShmComm ,&ShmRank); + MPI_Comm_size(ShmComm ,&ShmSize); + GroupSize = WorldSize/ShmSize; + + ///////////////////////////////////////////////////////////////////// + // find world ranks in our SHM group (i.e. which ranks are on our node) + ///////////////////////////////////////////////////////////////////// + MPI_Group WorldGroup, ShmGroup; + MPI_Comm_group (communicator_world, &WorldGroup); + MPI_Comm_group (ShmComm, &ShmGroup); + + std::vector world_ranks(WorldSize); + GroupRanks.resize(WorldSize); + MyGroup.resize(ShmSize); + for(int r=0;r()); + int myleader = MyGroup[0]; + + std::vector leaders_1hot(WorldSize,0); + std::vector leaders_group(GroupSize,0); + leaders_1hot [ myleader ] = 1; + + /////////////////////////////////////////////////////////////////// + // global sum leaders over comm world + /////////////////////////////////////////////////////////////////// + int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world); + assert(ierr==0); + + /////////////////////////////////////////////////////////////////// + // find the group leaders world rank + /////////////////////////////////////////////////////////////////// + int group=0; + for(int l=0;l coor = _processor_coor; @@ -80,139 +250,13 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &c rank = LexicographicToWorldRank[rank]; } -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Info that is setup once and indept of cartesian layout -/////////////////////////////////////////////////////////////////////////////////////////////////// -int CartesianCommunicator::ShmSetup = 0; -int CartesianCommunicator::ShmRank; -int CartesianCommunicator::ShmSize; -int CartesianCommunicator::GroupRank; -int CartesianCommunicator::GroupSize; -MPI_Comm CartesianCommunicator::ShmComm; -MPI_Win CartesianCommunicator::ShmWindow; -std::vector CartesianCommunicator::GroupRanks; -std::vector CartesianCommunicator::MyGroup; - CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { - - _ndimension = 
processors.size(); - - WorldDims = processors; - - communicator = MPI_COMM_WORLD; - MPI_Comm_rank(communicator,&WorldRank); - MPI_Comm_size(communicator,&WorldSize); - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free. - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Does every grid need one, or could we share across all grids via a singleton/guard? int ierr; - if ( !ShmSetup ) { + communicator=communicator_world; - MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); - MPI_Comm_rank(ShmComm ,&ShmRank); - MPI_Comm_size(ShmComm ,&ShmSize); - GroupSize = WorldSize/ShmSize; - - ///////////////////////////////////////////////////////////////////// - // find world ranks in our SHM group (i.e. which ranks are on our node) - ///////////////////////////////////////////////////////////////////// - MPI_Group WorldGroup, ShmGroup; - MPI_Comm_group (communicator, &WorldGroup); - MPI_Comm_group (ShmComm, &ShmGroup); - - std::vector world_ranks(WorldSize); - GroupRanks.resize(WorldSize); - MyGroup.resize(ShmSize); - for(int r=0;r()); - int myleader = MyGroup[0]; - - std::vector leaders_1hot(WorldSize,0); - std::vector leaders_group(GroupSize,0); - leaders_1hot [ myleader ] = 1; - - /////////////////////////////////////////////////////////////////// - // global sum leaders over comm world - /////////////////////////////////////////////////////////////////// - ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator); - assert(ierr==0); - - /////////////////////////////////////////////////////////////////// - // find the group leaders world rank - /////////////////////////////////////////////////////////////////// - int group=0; - for(int l=0;l &processors) //////////////////////////////////////////////////////////////// int dim = 0; + std::vector WorldDims = processors; + ShmDims.resize(_ndimension,1); GroupDims.resize(_ndimension); @@ -346,21 +392,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, } } - -void *CartesianCommunicator::ShmBufferSelf(void) -{ - return ShmCommBufs[ShmRank]; -} -void *CartesianCommunicator::ShmBuffer(int rank) -{ - int gpeer = GroupRanks[rank]; - if (gpeer == MPI_UNDEFINED){ - return NULL; - } else { - return ShmCommBufs[gpeer]; - } -} - // Basic Halo comms primitive void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, void *xmit, @@ -369,6 +400,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int from, int bytes) { +#if 1 MPI_Request xrq; MPI_Request rrq; @@ -387,12 +419,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis sequence++; - char *to_ptr = (char *)ShmCommBufs[gdest]; char *from_ptr = (char *)ShmCommBufs[ShmRank]; int small = (bytes &lis if ( small && (gdest !=MPI_UNDEFINED) ) { + char *to_ptr = (char *)ShmCommBufs[gdest]; + assert(gme != gdest); T *ip = (T *)xmit; T *op = (T *)to_ptr; PARALLEL_FOR_LOOP for(int w=0;w "<< gdest<<" " < &list, @@ -476,19 +528,29 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list) +{ + SendToRecvFromComplete(list); +} + +void CartesianCommunicator::StencilBarrier(void) +{ MPI_Win_sync (ShmWindow); MPI_Barrier (ShmComm); MPI_Win_sync (ShmWindow); - } - void 
CartesianCommunicator::SendToRecvFromComplete(std::vector &list) { int nreq=list.size(); std::vector status(nreq); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); - assert(ierr==0); } @@ -514,7 +576,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) bytes, MPI_BYTE, root, - MPI_COMM_WORLD); + communicator_world); assert(ierr==0); } diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 80b8fb90..d32fe4fa 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -28,18 +28,29 @@ Author: Peter Boyle #include "Grid.h" namespace Grid { +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +int CartesianCommunicator::WorldRank; +int CartesianCommunicator::WorldSize; +int CartesianCommunicator::Slave; +void * CartesianCommunicator::ShmCommBuf; +commVector CartesianCommunicator::ShmBufStorageVector; + void CartesianCommunicator::Init(int *argc, char *** arv) { -} - -int Rank(void ){ return 0; }; -void *CartesianCommunicator::ShmBufferSelf(void) -{ - return NULL; -} -void *CartesianCommunicator::ShmBuffer(int rank) -{ - return NULL; + WorldRank = 0; + WorldSize = 1; + ShmRank=0; + ShmSize=1; + GroupRank=_WorldRank; + GroupSize=_WorldSize; + Slave =0; + ShmInitGeneric(); } CartesianCommunicator::CartesianCommunicator(const std::vector &processors) @@ -97,30 +108,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & assert(0); } -void CartesianCommunicator::Barrier(void) -{ -} - -void CartesianCommunicator::Broadcast(int root,void* data, int bytes) -{ -} -void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) -{ -} - - +void CartesianCommunicator::Barrier(void){} +void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} +void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } +int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) { return 0;} +void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor){ assert(0);} void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) { source =0; dest=0; } -int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) -{ - return 0; -} -void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor) -{ -} } diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index 4af719b0..544b37c7 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -39,25 +39,19 @@ namespace Grid { BACKTRACEFILE(); \ }\ } -int Rank(void) { - return shmem_my_pe(); -} -typedef struct HandShake_t { - uint64_t seq_local; - uint64_t seq_remote; -} HandShake; -static Vector< HandShake > XConnections; -static Vector< HandShake > RConnections; -void *CartesianCommunicator::ShmBufferSelf(void) -{ - return NULL; -} -void *CartesianCommunicator::ShmBuffer(int rank) -{ - return NULL; -} +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout 
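
SendToRecvFromComplete at the top of this section is the drain half of the split-phase exchange: Begin posts an MPI_Irecv/MPI_Isend pair and stashes both requests, Complete issues one MPI_Waitall over the accumulated list (note the series quietly drops the assert on the MPI_Waitall return code here). The shape of the pairing, with placeholder buffer names and the patch's tag-by-sender-rank convention:

#include <mpi.h>
#include <vector>

// Begin: post both sides, remember the requests.
void send_to_recv_from_begin(MPI_Comm comm, int me, int dest, int from,
                             void *xmit, void *recv, int bytes,
                             std::vector<MPI_Request> &list) {
  MPI_Request xrq, rrq;
  MPI_Irecv(recv, bytes, MPI_CHAR, from, from, comm, &rrq);
  MPI_Isend(xmit, bytes, MPI_CHAR, dest, me, comm, &xrq);
  list.push_back(rrq);
  list.push_back(xrq);
}

// Complete: one MPI_Waitall drains every outstanding request.
void send_to_recv_from_complete(std::vector<MPI_Request> &list) {
  std::vector<MPI_Status> status(list.size());
  MPI_Waitall((int)list.size(), &list[0], &status[0]);
  list.clear();
}
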
+/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +int CartesianCommunicator::WorldRank; +int CartesianCommunicator::WorldSize; +int CartesianCommunicator::Slave; + void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); XConnections.resize(shmem_n_pes()); @@ -69,7 +63,36 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { RConnections[pe].seq_remote= 0; } shmem_barrier_all(); + ShmInitGeneric(); } + + +// Should error check all MPI calls. +void CartesianCommunicator::Init(int *argc, char ***argv) { + int flag; + MPI_Initialized(&flag); // needed to coexist with other libs apparently + if ( !flag ) { + MPI_Init(argc,argv); + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + MPI_Comm_rank(communicator_world,&_WorldRank); + MPI_Comm_size(communicator_world,&_WorldSize); + _ShmRank=0; + _ShmSize=1; + _GroupRank=_WorldRank; + _GroupSize=_WorldSize; + _Slave =0; + } +} + + +typedef struct HandShake_t { + uint64_t seq_local; + uint64_t seq_remote; +} HandShake; + +static Vector< HandShake > XConnections; +static Vector< HandShake > RConnections; + CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { _ndimension = processors.size(); From 392e0645137a988dea02f74e27ba2a8ad8182328 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 24 Oct 2016 19:24:21 +0100 Subject: [PATCH 14/27] fast local peek-poke --- lib/lattice/Lattice_peekpoke.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/lib/lattice/Lattice_peekpoke.h b/lib/lattice/Lattice_peekpoke.h index 9bece943..19d349c4 100644 --- a/lib/lattice/Lattice_peekpoke.h +++ b/lib/lattice/Lattice_peekpoke.h @@ -154,7 +154,7 @@ PARALLEL_FOR_LOOP template void peekLocalSite(sobj &s,const Lattice &l,std::vector &site){ - GridBase *grid=l._grid; + GridBase *grid = l._grid; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -164,16 +164,18 @@ PARALLEL_FOR_LOOP assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); + static const int words=sizeof(vobj)/sizeof(vector_type); int odx,idx; idx= grid->iIndex(site); odx= grid->oIndex(site); - std::vector buf(Nsimd); - - extract(l._odata[odx],buf); + scalar_type * vp = (scalar_type *)&l._odata[odx]; + scalar_type * pt = (scalar_type *)&s; + + for(int w=0;wCheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); + static const int words=sizeof(vobj)/sizeof(vector_type); int odx,idx; idx= grid->iIndex(site); odx= grid->oIndex(site); - std::vector buf(Nsimd); - - // extract-modify-merge cycle is easiest way and this is not perf critical - extract(l._odata[odx],buf); + scalar_type * vp = (scalar_type *)&l._odata[odx]; + scalar_type * pt = (scalar_type *)&s; - buf[idx] = s; - - merge(l._odata[odx],buf); + for(int w=0;w Date: Mon, 24 Oct 2016 19:25:15 +0100 Subject: [PATCH 15/27] memory optimisation --- lib/cartesian/Cartesian_base.h | 2 +- lib/cartesian/Cartesian_full.h | 2 +- lib/cartesian/Cartesian_red_black.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index 8a24c87b..72b21ee3 100644 --- a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -77,7 +77,7 @@ public: // GridCartesian / GridRedBlackCartesian 
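
Patch 14 above replaces the extract/merge round trip in peekLocalSite/pokeLocalSite with direct strided copies: lane idx of SIMD word w lives at scalar offset idx + w*Nsimd, so touching one site moves only 'words' scalars instead of all Nsimd lanes. The indexing in isolation (sketch with plain arrays; names mirror the patch):

// Gather lane 'idx' out of an interleaved SIMD object: the scalar parts
// of one site are strided by Nsimd in memory.
template <typename scalar_type>
void peek_lane(const scalar_type *vp, scalar_type *pt,
               int words, int Nsimd, int idx) {
  for (int w = 0; w < words; w++)
    pt[w] = vp[idx + w * Nsimd];   // read one lane only
}

template <typename scalar_type>
void poke_lane(scalar_type *vp, const scalar_type *pt,
               int words, int Nsimd, int idx) {
  for (int w = 0; w < words; w++)
    vp[idx + w * Nsimd] = pt[w];   // write one lane, others untouched
}
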
//////////////////////////////////////////////////////////////// virtual int CheckerBoarded(int dim)=0; - virtual int CheckerBoard(std::vector site)=0; + virtual int CheckerBoard(std::vector &site)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index 14ab8b55..b0d20441 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -49,7 +49,7 @@ public: virtual int CheckerBoarded(int dim){ return 0; } - virtual int CheckerBoard(std::vector site){ + virtual int CheckerBoard(std::vector &site){ return 0; } virtual int CheckerBoardDestination(int cb,int shift,int dim){ diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index db0508d5..6a4300d7 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -49,7 +49,7 @@ public: if( dim==_checker_dim) return 1; else return 0; } - virtual int CheckerBoard(std::vector site){ + virtual int CheckerBoard(std::vector &site){ int linear=0; assert(site.size()==_ndimension); for(int d=0;d<_ndimension;d++){ From 13bf0482e3165514feeccf626d60add2aa91910c Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 24 Oct 2016 19:25:40 +0100 Subject: [PATCH 16/27] FFT optimisation --- lib/FFT.h | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/lib/FFT.h b/lib/FFT.h index 4cda6483..17060dc3 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -200,18 +200,14 @@ namespace Grid { sign,FFTW_ESTIMATE); } - double add,mul,fma; - FFTW::fftw_flops(p,&add,&mul,&fma); - flops_call = add+mul+2.0*fma; - - GridStopWatch timer; + std::vector lcoor(Nd), gcoor(Nd); // Barrel shift and collect global pencil for(int p=0;plSites();idx++) { - std::vector lcoor(Nd); + sgrid->LocalIndexToLocalCoor(idx,lcoor); sobj s; @@ -228,14 +224,11 @@ namespace Grid { // Loop over orthog coords int NN=pencil_g.lSites(); - - GridStopWatch Timer; - Timer.Start(); + GridStopWatch timer; + timer.Start(); PARALLEL_FOR_LOOP - for(int idx=0;idx lcoor(Nd); + for(int idx=0;idx::fftw_flops(p,&add,&mul,&fma); + flops_call = add+mul+2.0*fma; + usec += timer.useconds(); + flops+= flops_call*NN; int pc = processor_coor[dim]; - for(int idx=0;idxlSites();idx++) { - std::vector lcoor(Nd); + for(int idx=0;idxlSites();idx++) { sgrid->LocalIndexToLocalCoor(idx,lcoor); - std::vector gcoor = lcoor; + gcoor = lcoor; // extract the result sobj s; gcoor[dim] = lcoor[dim]+L*pc; From b94478fa51c46039bfc6beda741790b65dd2c46e Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Mon, 24 Oct 2016 23:45:31 +0100 Subject: [PATCH 17/27] mpi, mpi3, shmem all compile. 
mpi, mpi3 pass single node multi-rank --- lib/AlignedAllocator.h | 2 +- lib/Stencil.h | 31 ++++++------- lib/communicator/Communicator_base.cc | 12 ++--- lib/communicator/Communicator_mpi3.cc | 24 +++------- lib/communicator/Communicator_none.cc | 4 +- lib/communicator/Communicator_shmem.cc | 61 +++++++------------------- 6 files changed, 49 insertions(+), 85 deletions(-) diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h index 44929ca8..fa001adc 100644 --- a/lib/AlignedAllocator.h +++ b/lib/AlignedAllocator.h @@ -141,7 +141,7 @@ public: if ( bcast != ptr ) { std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout); - BACKTRACEFILE(); + // BACKTRACEFILE(); exit(0); } assert( bcast == (void *) ptr); diff --git a/lib/Stencil.h b/lib/Stencil.h index f22acb6f..221785b4 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -32,8 +32,6 @@ #include // subdir aggregate -const int ShmDirectCopy = 1; - ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. CSHIFT is not the best way, so need @@ -170,13 +168,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], Packets[i].send_buf, Packets[i].to_rank, Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes); + /* }else{ _grid->SendToRecvFromBegin(reqs[i], Packets[i].send_buf, @@ -185,17 +183,19 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Packets[i].from_rank, Packets[i].bytes); } + */ } commtime+=usecond(); } void CommunicateComplete(std::vector > &reqs) { commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); - else - _grid->SendToRecvFromComplete(reqs[i]); + // else + // _grid->SendToRecvFromComplete(reqs[i]); } commtime+=usecond(); } @@ -253,8 +253,6 @@ PARALLEL_FOR_LOOP // Flat vector, change layout for cache friendly. 
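
From patch 17 on, the compile-time ShmDirectCopy switch is gone: the stencil always asks the communicator for a shared-memory alias of the peer's receive buffer, and falls back to the MPI staging buffer only when the translation comes back NULL (peer on another node). The decision in skeleton form (shm_buffer_translate, staging and halo are placeholder names, not Grid's exact signatures):

#include <cstddef>
#include <cstring>

// Stand-in for Grid's ShmBufferTranslate: nullptr marks an off-node peer.
void *shm_buffer_translate(int rank, void *local_p);

void deliver_halo(int peer, void *peer_recv_base, void *staging,
                  const void *halo, std::size_t bytes) {
  void *dst = shm_buffer_translate(peer, peer_recv_base);
  if (dst == nullptr) dst = staging;  // off-node: stage for Isend/Irecv
  std::memcpy(dst, halo, bytes);      // on-node: the copy is the delivery
}
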
Vector _entries; - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - void PrecomputeByteOffsets(void){ for(int i=0;i<_entries.size();i++){ if( _entries[i]._is_local ) { @@ -265,9 +263,7 @@ PARALLEL_FOR_LOOP } }; - inline uint64_t Touch(int ent) { - // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); - } + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = _entries[ent]._is_local; @@ -685,7 +681,9 @@ PARALLEL_FOR_LOOP _grid->StencilBarrier(); HaloGather(source,compress); this->CommunicateBegin(reqs); + _grid->StencilBarrier(); this->CommunicateComplete(reqs); + _grid->StencilBarrier(); CommsMerge(); // spins } @@ -823,11 +821,13 @@ PARALLEL_FOR_LOOP cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); - if ( (ShmDirectCopy==0)||send_buf==NULL ) { - cobj *send_buf = u_send_buf_p; + if ( (send_buf==NULL) ) { + send_buf = u_send_buf_p; } - + // std::cout << " send_bufs "<ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if ((ShmDirectCopy==0)||(shm==NULL)) { + // if ((ShmDirectCopy==0)||(shm==NULL)) { + if (shm==NULL) { shm = rp; } diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 1272b6a2..91e9cf9b 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -69,7 +69,7 @@ int CartesianCommunicator::ProcessorCount(void) { return //////////////////////////////////////////////////////////////////////////////// // very VERY rarely (Log, serial RNG) we need world without a grid //////////////////////////////////////////////////////////////////////////////// -int CartesianCommunicator::RankWorld(void) { return WorldRank; }; +int CartesianCommunicator::RankWorld(void){ return WorldRank; }; int CartesianCommunicator::Ranks (void) { return WorldSize; }; int CartesianCommunicator::Nodes (void) { return GroupSize; }; int CartesianCommunicator::Cores (void) { return ShmSize; }; @@ -108,22 +108,22 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector CartesianCommunicator::ShmBufStorageVector; void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; } + void *CartesianCommunicator::ShmBuffer(int rank) { - if (rank != ShmRank ) return NULL; - else return ShmCommBuf; + return NULL; } void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { - if (rank != ShmRank ) return NULL; - else return local_p; + return NULL; } void CartesianCommunicator::ShmInitGeneric(void){ ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); ShmCommBuf=(void *)&ShmBufStorageVector[0]; + std::cout << "allocated persistent buffer"< &lis int from, int bytes) { -#if 1 +#if 0 + this->StencilBarrier(); + MPI_Request xrq; MPI_Request rrq; @@ -440,9 +442,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis PARALLEL_FOR_LOOP for(int w=0;w "<< gdest<<" " <StencilBarrier(); + if (small && (gfrom !=MPI_UNDEFINED) ) { T *ip = (T *)from_ptr; T *op = (T *)recv; PARALLEL_FOR_LOOP for(int w=0;wStencilBarrier(); + #else MPI_Request xrq; MPI_Request rrq; @@ -528,9 +521,6 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector XConnections; +static Vector< 
HandShake > RConnections; + void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); @@ -62,37 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { RConnections[pe].seq_local = 0; RConnections[pe].seq_remote= 0; } + WorldSize = shmem_n_pes(); + WorldRank = shmem_my_pe(); + ShmRank=0; + ShmSize=1; + GroupRank=WorldRank; + GroupSize=WorldSize; + Slave =0; shmem_barrier_all(); ShmInitGeneric(); } - -// Should error check all MPI calls. -void CartesianCommunicator::Init(int *argc, char ***argv) { - int flag; - MPI_Initialized(&flag); // needed to coexist with other libs apparently - if ( !flag ) { - MPI_Init(argc,argv); - MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); - MPI_Comm_rank(communicator_world,&_WorldRank); - MPI_Comm_size(communicator_world,&_WorldSize); - _ShmRank=0; - _ShmSize=1; - _GroupRank=_WorldRank; - _GroupSize=_WorldSize; - _Slave =0; - } -} - - -typedef struct HandShake_t { - uint64_t seq_local; - uint64_t seq_remote; -} HandShake; - -static Vector< HandShake > XConnections; -static Vector< HandShake > RConnections; - CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { _ndimension = processors.size(); @@ -261,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, if ( _processor == sender ) { - printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver); // Check he has posted a receive while(SendSeq->seq_remote == SendSeq->seq_local); - printf("Sender receive %d posted\n",sender,receiver); - // Advance our send count seq = ++(SendSeq->seq_local); @@ -275,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, shmem_putmem(recv,xmit,bytes,receiver); shmem_fence(); - printf("Sender sent payload %d\n",seq); //Notify him we're done shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); shmem_fence(); - printf("Sender ringing door bell %d\n",seq); } if ( _processor == receiver ) { - printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver); // Post a receive seq = ++(RecvSeq->seq_local); shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); - printf("Receiver Opening letter box %d\n",seq); - - // Now wait until he has advanced our reception counter while(RecvSeq->seq_remote != RecvSeq->seq_local); - printf("Receiver Got the mail %d\n",seq); } } From 7c3363b91ecbdb06de96842a610f73697a3de873 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Tue, 25 Oct 2016 00:04:17 +0100 Subject: [PATCH 18/27] Compiles all comms targets --- lib/communicator/Communicator_none.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index f4e4c575..198c1add 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -31,15 +31,6 @@ namespace Grid { /////////////////////////////////////////////////////////////////////////////////////////////////// // Info that is setup once and indept of cartesian layout /////////////////////////////////////////////////////////////////////////////////////////////////// -int CartesianCommunicator::ShmRank; -int CartesianCommunicator::ShmSize; -int CartesianCommunicator::GroupRank; -int CartesianCommunicator::GroupSize; -int CartesianCommunicator::WorldRank; -int CartesianCommunicator::WorldSize; -int CartesianCommunicator::Slave; -void * CartesianCommunicator::ShmCommBuf; -commVector CartesianCommunicator::ShmBufStorageVector; void CartesianCommunicator::Init(int *argc, char *** arv) { From d97a27f4833c6827dc3d344a5186f0a3e4d2d972 
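
Stripped of the printf chatter that this commit deletes, SendRecvPacket is a doorbell protocol over SHMEM symmetric memory: the receiver posts by bumping the sender's remote counter, the sender spins until posted, puts the payload, then puts the sequence number. The ordering in outline (sketch built from the patch's own HandShake counters; shmem_fence only guarantees the doorbell put cannot overtake the payload put to the same PE):

// Sender side: wait for the post, put payload, ring the doorbell.
while (SendSeq->seq_remote == SendSeq->seq_local) { /* spin */ }
uint64_t seq = ++(SendSeq->seq_local);
shmem_putmem(recv, xmit, bytes, receiver);                          // payload
shmem_fence();                                                      // order
shmem_putmem((void *)&(RecvSeq->seq_remote), &seq, sizeof(seq), receiver);

// Receiver side: post, then wait for the doorbell.
uint64_t post = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote), &post, sizeof(post), sender);
while (RecvSeq->seq_remote != RecvSeq->seq_local) { /* spin */ }
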
Mon Sep 17 00:00:00 2001
From: azusayamaguchi
Date: Tue, 25 Oct 2016 01:05:31 +0100
Subject: [PATCH 19/27] Verbose

---
 lib/communicator/Communicator_base.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 91e9cf9b..bccba6be 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -123,7 +123,6 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
 void CartesianCommunicator::ShmInitGeneric(void){
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   ShmCommBuf=(void *)&ShmBufStorageVector[0];
-  std::cout << "allocated persistent buffer" << std::endl;
 }

Date: Tue, 25 Oct 2016 01:05:52 +0100
Subject: [PATCH 20/27] More random bits on parallel seeding

---
 lib/lattice/Lattice_rng.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/lattice/Lattice_rng.h b/lib/lattice/Lattice_rng.h
index 3254af30..2df5e2d1 100644
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -296,9 +296,10 @@ namespace Grid {
 	  _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 
 	  int l_idx=generator_idx(o_idx,i_idx);
-
-	  std::vector<int> site_seeds(4);
-	  for(int i=0;i<4;i++){
+
+	  const int num_rand_seed=16;
+	  std::vector<int> site_seeds(num_rand_seed);
+	  for(int i=0;i<num_rand_seed;i++){

Date: Tue, 25 Oct 2016 01:45:53 +0100
Subject: [PATCH 21/27] Travis fail fix attempt

---
 lib/Stencil.h                         | 2 +-
 lib/communicator/Communicator_base.cc | 3 ++-
 lib/communicator/Communicator_base.h  | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/Stencil.h b/lib/Stencil.h
index 221785b4..5c3a5ef9 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -821,7 +821,7 @@ PARALLEL_FOR_LOOP
 	  cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
-	  if ( (send_buf==NULL) ) {
+	  if ( send_buf==NULL ) {
 	    send_buf = u_send_buf_p;
 	  }
 	  //	  std::cout << " send_bufs " << std::endl;

Date: Tue, 25 Oct 2016 06:01:12 +0100
Subject: [PATCH 22/27] MPI 3 compile on non-linux

---
 lib/communicator/Communicator_mpi3.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 61e5e06b..14a152ab 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -27,7 +27,6 @@ Author: Peter Boyle
 /* END LEGAL */
 #include "Grid.h"
 #include <mpi.h>
-#include <numaif.h>
 
 namespace Grid {
 
@@ -157,6 +156,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
   assert(ierr==0);
   // KNL hack -- force to numa-domain 1 in flat
 #if 0
+  //#include <numaif.h>
   for(uint64_t page=0;page

Date: Tue, 25 Oct 2016 12:56:40 +0100
Subject: [PATCH 23/27] temporary thread safety in FFT

---
 lib/FFT.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/FFT.h b/lib/FFT.h
index 17060dc3..9a59ed01 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -227,7 +227,7 @@ namespace Grid {
       GridStopWatch timer;
       timer.Start();
 
-PARALLEL_FOR_LOOP
+//PARALLEL_FOR_LOOP
       for(int idx=0;idx<NN;idx++) {

Date: Tue, 25 Oct 2016 14:21:48 +0100
Subject: [PATCH 24/27] Note: FFT should be GridFFT (not changed yet).
Gauge fix with FFt is added (tests/core) --- lib/qcd/utils/SUn.h | 31 ++++ lib/qcd/utils/WilsonLoops.h | 2 +- tests/core/Test_fft.cc | 3 + tests/core/Test_fft_gfix.cc | 301 ++++++++++++++++++++++++++++++++++++ 4 files changed, 336 insertions(+), 1 deletion(-) create mode 100644 tests/core/Test_fft_gfix.cc diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h index 3240098a..914c3bad 100644 --- a/lib/qcd/utils/SUn.h +++ b/lib/qcd/utils/SUn.h @@ -674,6 +674,37 @@ class SU { out += la; } } +/* + add GaugeTrans +*/ + +template + static void GaugeTransform( GaugeField &Umu, GaugeMat &g){ + GridBase *grid = Umu._grid; + conformable(grid,g._grid); + + GaugeMat U(grid); + GaugeMat ag(grid); ag = adj(g); + + for(int mu=0;mu(Umu,mu); + U = g*U*Cshift(ag, mu, 1); + PokeIndex(Umu,U,mu); + } + } + template + static void GaugeTransform( std::vector &U, GaugeMat &g){ + GridBase *grid = g._grid; + GaugeMat ag(grid); ag = adj(g); + for(int mu=0;mu + static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g){ + LieRandomize(pRNG,g,1.0); + GaugeTransform(Umu,g); + } // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 ) // inverse operation: FundamentalLieAlgebraMatrix diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h index 10022f50..03d45c07 100644 --- a/lib/qcd/utils/WilsonLoops.h +++ b/lib/qcd/utils/WilsonLoops.h @@ -522,4 +522,4 @@ typedef WilsonLoops SU3WilsonLoops; } } -#endif \ No newline at end of file +#endif diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 562a53a1..512d6e62 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -441,6 +441,8 @@ int main (int argc, char ** argv) } { + /* + * typedef GaugeImplTypes QEDGimplTypesD; typedef Photon QEDGaction; @@ -450,6 +452,7 @@ int main (int argc, char ** argv) Maxwell.FreePropagator (Source,Prop); std::cout << " MaxwellFree propagator\n"; + */ } Grid_finalize(); } diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc new file mode 100644 index 00000000..1eda9a67 --- /dev/null +++ b/tests/core/Test_fft_gfix.cc @@ -0,0 +1,301 @@ + /************************************************************************************* + + grid` physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
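
One step back to the SUn.h hunk above: GaugeTransform implements the standard lattice gauge rotation, with Cshift(ag, mu, 1) supplying the adjoint matrix from the forward neighbour. In conventional notation (my gloss; the relation is textbook, not quoted from the patch):

  U_\mu(x)  -->  U^{g}_\mu(x) = g(x) \, U_\mu(x) \, g^{\dagger}(x + \hat\mu)

RandomGaugeTransform draws g via LieRandomize and applies exactly this map, so every gauge-invariant quantity, the plaquette in particular, must come through it unchanged; that invariance is how the gauge-fixing test below sanity-checks itself.
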
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace Grid; +using namespace Grid::QCD; + +template +class FourierAcceleratedGaugeFixer : public Gimpl { + public: + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Gimpl::GaugeLinkField GaugeMat; + typedef typename Gimpl::GaugeField GaugeLorentz; + + static void GaugeLinkToLieAlgebraField(const std::vector &U,std::vector &A) { + for(int mu=0;mu &A,GaugeMat &dmuAmu) { + dmuAmu=zero; + for(int mu=0;mu::avgPlaquette(Umu); + RealD org_link_trace=WilsonLoops::linkTrace(Umu); + RealD old_trace = org_link_trace; + RealD trG; + + std::vector U(Nd,grid); + GaugeMat dmuAmu(grid); + + for(int i=0;i(Umu,mu); + //trG = SteepestDescentStep(U,alpha,dmuAmu); + trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu); + for(int mu=0;mu(Umu,U[mu],mu); + // Monitor progress and convergence test + // infrequently to minimise cost overhead + if ( i %20 == 0 ) { + RealD plaq =WilsonLoops::avgPlaquette(Umu); + RealD link_trace=WilsonLoops::linkTrace(Umu); + + std::cout << GridLogMessage << " Iteration "< &U,RealD & alpha, GaugeMat & dmuAmu) { + GridBase *grid = U[0]._grid; + + std::vector A(Nd,grid); + GaugeMat g(grid); + + GaugeLinkToLieAlgebraField(U,A); + ExpiAlphaDmuAmu(A,g,alpha,dmuAmu); + + + RealD vol = grid->gSites(); + RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + + SU::GaugeTransform(U,g); + + return trG; + } + + static RealD FourierAccelSteepestDescentStep(std::vector &U,RealD & alpha, GaugeMat & dmuAmu) { + + GridBase *grid = U[0]._grid; + + RealD vol = grid->gSites(); + + FFT theFFT((GridCartesian *)grid); + + LatticeComplex Fp(grid); + LatticeComplex psq(grid); psq=zero; + LatticeComplex pmu(grid); + LatticeComplex one(grid); one = ComplexD(1.0,0.0); + + GaugeMat g(grid); + GaugeMat dmuAmu_p(grid); + std::vector A(Nd,grid); + + GaugeLinkToLieAlgebraField(U,A); + + DmuAmu(A,dmuAmu); + + theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward); + + ////////////////////////////////// + // Work out Fp = psq_max/ psq... + ////////////////////////////////// + std::vector latt_size = grid->GlobalDimensions(); + std::vector coor(grid->_ndimension,0); + for(int mu=0;mu::taExp(ciadmam,g); + + RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + + SU::GaugeTransform(U,g); + + return trG; + } + + static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) { + GridBase *grid = g._grid; + ComplexD cialpha(0.0,-alpha); + GaugeMat ciadmam(grid); + DmuAmu(A,dmuAmu); + ciadmam = dmuAmu*cialpha; + SU::taExp(ciadmam,g); + } +/* + //////////////////////////////////////////////////////////////// + // NB The FT for fields living on links has an extra phase in it + // Could add these to the FFT class as a later task since this code + // might be reused elsewhere ???? 
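
The two descent steps above differ only in how the gradient \partial_\mu A_\mu becomes an update; this is the standard Fourier-accelerated steepest descent for Landau gauge. Schematically (my summary of the code, standard conventions):

  plain:        g(x) = exp( -i \alpha \, \partial_\mu A_\mu(x) )
  accelerated:  g(x) = exp( -i \alpha \, F^{-1}[ (\hat p^2_{max} / \hat p^2) \, F[\partial_\mu A_\mu] ](x) )

with lattice momentum \hat p^2 = \sum_\mu 4 \sin^2(\pi n_\mu / L_\mu), exactly the psq assembled from 4.0*sin(pmu*0.5)^2, and the zero mode handled separately so Fp stays finite. Weighting each mode by \hat p^2_{max}/\hat p^2 enlarges the step for the slow long-wavelength modes, which is what removes the critical slowing down of plain descent; convergence shows up as trG -> 1 while the plaquette, being gauge invariant, must hold constant to rounding.
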
+ //////////////////////////////////////////////////////////////// + static void InverseFourierTransformAmu(FFT &theFFT,const std::vector &Ap,std::vector &Ax) { + GridBase * grid = theFFT.Grid(); + std::vector latt_size = grid->GlobalDimensions(); + + ComplexField pmu(grid); + ComplexField pha(grid); + GaugeMat Apha(grid); + + ComplexD ci(0.0,1.0); + + for(int mu=0;mu &Ax,std::vector &Ap) { + GridBase * grid = theFFT.Grid(); + std::vector latt_size = grid->GlobalDimensions(); + + ComplexField pmu(grid); + ComplexField pha(grid); + ComplexD ci(0.0,1.0); + + // Sign convention for FFTW calls: + // A(x)= Sum_p e^ipx A(p) / V + // A(p)= Sum_p e^-ipx A(x) + + for(int mu=0;mu seeds({1,2,3,4}); + + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + int vol = 1; + for(int d=0;d::avgPlaquette(Umu); + std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); + + + plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Final plaquette "< Date: Wed, 26 Oct 2016 17:36:26 +0100 Subject: [PATCH 25/27] more FFT optimisations --- lib/FFT.h | 233 +++++++++++++++++++---------------------- tests/core/Test_fft.cc | 24 +++-- 2 files changed, 121 insertions(+), 136 deletions(-) diff --git a/lib/FFT.h b/lib/FFT.h index 9a59ed01..71cf8a3e 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -98,174 +98,157 @@ namespace Grid { #define FFTW_BACKWARD (+1) #endif - class FFT { + class FFT { private: - + GridCartesian *vgrid; GridCartesian *sgrid; - + int Nd; double flops; double flops_call; uint64_t usec; - + std::vector dimensions; std::vector processors; std::vector processor_coor; - + public: - + static const int forward=FFTW_FORWARD; static const int backward=FFTW_BACKWARD; - + double Flops(void) {return flops;} double MFlops(void) {return flops/usec;} - - FFT ( GridCartesian * grid ) : - vgrid(grid), - Nd(grid->_ndimension), - dimensions(grid->_fdimensions), - processors(grid->_processors), - processor_coor(grid->_processor_coor) + + FFT ( GridCartesian * grid ) : + vgrid(grid), + Nd(grid->_ndimension), + dimensions(grid->_fdimensions), + processors(grid->_processors), + processor_coor(grid->_processor_coor) { flops=0; usec =0; std::vector layout(Nd,1); sgrid = new GridCartesian(dimensions,layout,processors); }; - - ~FFT ( void) { - delete sgrid; + + ~FFT ( void) { + delete sgrid; } template void FFT_dim(Lattice &result,const Lattice &source,int dim, int inverse){ - +#ifndef HAVE_FFTW + assert(0); +#else conformable(result._grid,vgrid); conformable(source._grid,vgrid); - + int L = vgrid->_ldimensions[dim]; int G = vgrid->_fdimensions[dim]; - + std::vector layout(Nd,1); std::vector pencil_gd(vgrid->_fdimensions); - - pencil_gd[dim] = G*processors[dim]; - + + pencil_gd[dim] = G*processors[dim]; + // Pencil global vol LxLxGxLxL per node GridCartesian pencil_g(pencil_gd,layout,processors); - + // Construct pencils typedef typename vobj::scalar_object sobj; typedef typename sobj::scalar_type scalar; + + Lattice pgbuf(&pencil_g); + - Lattice ssource(vgrid); ssource =source; - Lattice pgsource(&pencil_g); - Lattice pgresult(&pencil_g); pgresult=zero; - -#ifndef HAVE_FFTW - assert(0); -#else typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; - - { - int Ncomp = sizeof(sobj)/sizeof(scalar); - int Nlow = 1; - for(int d=0;d_ldimensions[d]; - } - - int rank = 1; /* 1d transforms */ - int n[] 
= {G}; /* 1d transforms of length G */ - int howmany = Ncomp; - int odist,idist,istride,ostride; - idist = odist = 1; /* Distance between consecutive FT's */ - istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ - int *inembed = n, *onembed = n; - - - int sign = FFTW_FORWARD; - if (inverse) sign = FFTW_BACKWARD; - - FFTW_plan p; - { - FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0]; - FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0]; - p = FFTW::fftw_plan_many_dft(rank,n,howmany, - in,inembed, - istride,idist, - out,onembed, - ostride, odist, - sign,FFTW_ESTIMATE); - } - - std::vector lcoor(Nd), gcoor(Nd); - - // Barrel shift and collect global pencil - for(int p=0;plSites();idx++) { - - - sgrid->LocalIndexToLocalCoor(idx,lcoor); - - sobj s; - - peekLocalSite(s,ssource,lcoor); - - lcoor[dim]+=p*L; - - pokeLocalSite(s,pgsource,lcoor); - } - - ssource = Cshift(ssource,dim,L); - } - - // Loop over orthog coords - int NN=pencil_g.lSites(); - GridStopWatch timer; - timer.Start(); - -//PARALLEL_FOR_LOOP - for(int idx=0;idx::fftw_execute_dft(p,in,out); - } - } - - timer.Stop(); - - double add,mul,fma; - FFTW::fftw_flops(p,&add,&mul,&fma); - flops_call = add+mul+2.0*fma; - usec += timer.useconds(); - flops+= flops_call*NN; - int pc = processor_coor[dim]; - for(int idx=0;idxlSites();idx++) { - sgrid->LocalIndexToLocalCoor(idx,lcoor); - gcoor = lcoor; - // extract the result - sobj s; - gcoor[dim] = lcoor[dim]+L*pc; - peekLocalSite(s,pgresult,gcoor); - pokeLocalSite(s,result,lcoor); - } - - FFTW::fftw_destroy_plan(p); + + int Ncomp = sizeof(sobj)/sizeof(scalar); + int Nlow = 1; + for(int d=0;d_ldimensions[d]; } + + int rank = 1; /* 1d transforms */ + int n[] = {G}; /* 1d transforms of length G */ + int howmany = Ncomp; + int odist,idist,istride,ostride; + idist = odist = 1; /* Distance between consecutive FT's */ + istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ + int *inembed = n, *onembed = n; + + int sign = FFTW_FORWARD; + if (inverse) sign = FFTW_BACKWARD; + + FFTW_plan p; + { + FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0]; + FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0]; + p = FFTW::fftw_plan_many_dft(rank,n,howmany, + in,inembed, + istride,idist, + out,onembed, + ostride, odist, + sign,FFTW_ESTIMATE); + } + + // Barrel shift and collect global pencil + std::vector lcoor(Nd), gcoor(Nd); + result = source; + for(int p=0;plSites();idx++) { + sgrid->LocalIndexToLocalCoor(idx,lcoor); + sobj s; + peekLocalSite(s,result,lcoor); + lcoor[dim]+=p*L; + pokeLocalSite(s,pgbuf,lcoor); + } + result = Cshift(result,dim,L); + } + + // Loop over orthog coords + int NN=pencil_g.lSites(); + GridStopWatch timer; + timer.Start(); + //PARALLEL_FOR_LOOP + for(int idx=0;idx::fftw_execute_dft(p,in,out); + } + } + timer.Stop(); + + // performance counting + double add,mul,fma; + FFTW::fftw_flops(p,&add,&mul,&fma); + flops_call = add+mul+2.0*fma; + usec += timer.useconds(); + flops+= flops_call*NN; + + // writing out result + int pc = processor_coor[dim]; + for(int idx=0;idxlSites();idx++) { + sgrid->LocalIndexToLocalCoor(idx,lcoor); + gcoor = lcoor; + sobj s; + gcoor[dim] = lcoor[dim]+L*pc; + peekLocalSite(s,pgbuf,gcoor); + pokeLocalSite(s,result,lcoor); + } + + // destroying plan + FFTW::fftw_destroy_plan(p); #endif - - } - }; - - } #endif diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 2fdaeed2..d5b60752 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -76,13 +76,14 @@ int main (int argc, char ** 
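
Before the test diff resumes: the FFT_dim rewrite above leans on FFTW's advanced interface, where a single plan runs 'howmany' interleaved transforms and the choice istride = Ncomp*Nlow, idist = 1 matches Grid's site-interleaved scalar layout. A self-contained toy with the same stride pattern (double-precision FFTW3 called directly, small made-up sizes; Grid's FFTW wrapper templates this over precision):

#include <fftw3.h>

int main(void) {
  const int G = 16;        // transform length (one pencil)
  const int howmany = 4;   // interleaved component transforms
  fftw_complex *buf = fftw_alloc_complex((size_t)(G * howmany));

  int n[1] = {G};
  // Transform j walks buf[j], buf[j+howmany], ... : stride 'howmany'
  // between elements of one FT, distance 1 between neighbouring FTs.
  fftw_plan p = fftw_plan_many_dft(1, n, howmany,
                                   buf, n, howmany, 1,
                                   buf, n, howmany, 1,
                                   FFTW_FORWARD, FFTW_ESTIMATE);

  for (int i = 0; i < G * howmany; i++) { buf[i][0] = 1.0; buf[i][1] = 0.0; }
  fftw_execute(p);

  fftw_destroy_plan(p);
  fftw_free(buf);
  return 0;
}
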
argv)
   S=zero;
   S = S+C;
 
+  Ctilde = C;
   FFT theFFT(&Fine);
 
-  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops() << std::endl;

Date: Wed, 26 Oct 2016 18:50:07 +0100
Subject: [PATCH 26/27] debug message removed

---
 lib/communicator/Communicator_base.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 9810987d..8b789f4b 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -48,7 +48,6 @@ void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
   void *ptr = (void *)heap_top;
   heap_top  += bytes;
   heap_bytes+= bytes;
-  std::cout << "Shm alloc " << std::endl;

Date: Sat, 29 Oct 2016 11:04:02 +0100
Subject: [PATCH 27/27] Add missing volume factor in stochastic QED field

---
 lib/qcd/action/gauge/Photon.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index 0dd5a9dc..409569d8 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -151,12 +151,19 @@ namespace QCD{
     {
       auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
       const unsigned int nd = grid->_ndimension;
+      std::vector<int>   latt_size = grid->_fdimensions;
 
       GaugeLinkField sqrtK2Inv(grid), r(grid);
       GaugeField     aTilde(grid);
       FFT            fft(grid);
 
+      Integer vol = 1;
+      for(int d = 0; d < nd; d++)
+      {
+        vol = vol * latt_size[d];
+      }
+
       invKHatSquared(sqrtK2Inv);
-      sqrtK2Inv = sqrt(real(sqrtK2Inv));
+      sqrtK2Inv = sqrt(vol*real(sqrtK2Inv));
 
       zmSub(sqrtK2Inv);
 
       for(int mu = 0; mu < nd; mu++)
       {
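
A closing gloss on patch 27, derived from the FFT convention quoted in the commented transform code of Test_fft_gfix.cc, A(x) = (1/V) \sum_p e^{ipx} \tilde A(p), rather than stated in the commit itself: to realise the photon two-point function

  < A_\mu(x) A_\nu(y) > = \delta_{\mu\nu} \, (1/V) \sum_p e^{ip(x-y)} / \hat p^2 ,

unit-variance Gaussian modes \eta(p) must be scaled as \tilde A(p) = \sqrt{V / \hat p^2} \, \eta(p): the unnormalised inverse transform contributes 1/V twice to the variance while the momentum sum contributes only once, leaving a factor V to restore. That \sqrt{V} is exactly the vol moved inside the sqrt by this final patch.
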