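Internal shared-memory (SHM) comms now working in the non-SIMD directions

SIMD directions still need to be fixed; the SIMD merge path in lib/Stencil.h is blocked with assert(0) for now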
2026-01-08 02:49:33 +00:00 · 2016-10-22 18:14:27 +01:00
parent 0fcd2e7188
commit c190221fd3
16 changed files with 1729 additions and 1739 deletions
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -153,7 +153,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result; 
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
    Dw.Report();
@@ -192,7 +192,7 @@ int main (int argc, char ** argv)

    std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    sDw.Report();
  
    if(0){
@@ -262,7 +262,7 @@ int main (int argc, char ** argv)
      double flops=(1344.0*volume*ncall)/2;

      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-      std::cout<<GridLogMessage << "sDeo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
      sDw.Report();

      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@@ -333,7 +333,7 @@ int main (int argc, char ** argv)
    double flops=(1344.0*volume*ncall)/2;

    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
    Dw.Report();
  }
  Dw.DhopEO(src_o,r_e,DaggerNo);
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -32,6 +32,8 @@

 #include <Grid/stencil/Lebesgue.h>   // subdir aggregate

+const int ShmDirectCopy = 1;
+
 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
 // gather to a point stencil code. CSHIFT is not the best way, so need
@@ -117,19 +119,6 @@ PARALLEL_FOR_LOOP
     }
 }

-template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
-			     double &t_table ,double & t_data )
-{
-  std::vector<std::pair<int,int> > table;
-  Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table);
-  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  Gather_plane_simple_table         (table,rhs,buffer,compress,off,so);
-}
-
-
-
-
 struct StencilEntry { 
  uint64_t _offset;
  uint64_t _byte_offset;
@@ -181,6 +170,14 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension
    reqs.resize(Packets.size());
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
+      if( ShmDirectCopy ) {
+	_grid->StencilSendToRecvFromBegin(reqs[i],
+					  Packets[i].send_buf,
+					  Packets[i].to_rank,
+					  Packets[i].recv_buf,
+					  Packets[i].from_rank,
+					  Packets[i].bytes);
+      }else{
 	_grid->SendToRecvFromBegin(reqs[i],
 				   Packets[i].send_buf,
 				   Packets[i].to_rank,
@@ -188,12 +185,16 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension
 				   Packets[i].from_rank,
 				   Packets[i].bytes);
      }
+    }
    commtime+=usecond();
  }
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
+      if( ShmDirectCopy ) 
+	_grid->StencilSendToRecvFromComplete(reqs[i]);
+      else 
 	_grid->SendToRecvFromComplete(reqs[i]);
    }
    commtime+=usecond();
@@ -259,7 +260,6 @@ PARALLEL_FOR_LOOP
      if( _entries[i]._is_local ) {
 	_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
      } else { 
-	     // PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952
 	_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
      }
    }
@@ -269,7 +269,7 @@ PARALLEL_FOR_LOOP
    //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
  }
  inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
-	 uint64_t cbase = (uint64_t)&comm_buf[0];
+    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
    local = _entries[ent]._is_local;
    perm  = _entries[ent]._permute;
    if (perm)  ptype = _permute_type[point]; 
@@ -280,23 +280,26 @@ PARALLEL_FOR_LOOP
    }
  }
  inline uint64_t GetPFInfo(int ent,uint64_t base) {
-	 uint64_t cbase = (uint64_t)&comm_buf[0];
+    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
    int local = _entries[ent]._is_local;
    if (local) return  base + _entries[ent]._byte_offset;
    else       return cbase + _entries[ent]._byte_offset;
  }
  
  ///////////////////////////////////////////////////////////
-       // Comms buffers
+  // Unified Comms buffers for all directions
  ///////////////////////////////////////////////////////////
  std::vector<commVector<scalar_object> > u_simd_send_buf;
  std::vector<commVector<scalar_object> > u_simd_recv_buf;
  commVector<cobj>          u_send_buf;
-       commVector<cobj>          comm_buf;
+  commVector<cobj>          u_recv_buf_hide;
+  cobj* u_recv_buf_p;

  int u_comm_offset;
  int _unified_buffer_size;
  
+  cobj *CommBuf(void) { return u_recv_buf_p; }
+
  /////////////////////////////////////////
  // Timing info; ugly; possibly temporary
  /////////////////////////////////////////
@@ -378,7 +381,6 @@ PARALLEL_FOR_LOOP
      int i = ii; // reverse direction to get SIMD comms done first
      int point = i;
      
-
      int dimension    = directions[i];
      int displacement = distances[i];
      int shift = displacement;
@@ -426,7 +428,21 @@ PARALLEL_FOR_LOOP
      }
    }
    u_send_buf.resize(_unified_buffer_size);
-         comm_buf.resize(_unified_buffer_size);
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try to allocate for receiving in a shared memory region, fall back to buffer
+    /////////////////////////////////////////////////////////////////////////////////
+    if( ShmDirectCopy ) {
+
+      u_recv_buf_p=(cobj *)_grid->ShmBufferSelf();
+      if ( u_recv_buf_p == NULL ) {
+	u_recv_buf_hide.resize(_unified_buffer_size);
+	u_recv_buf_p=&u_recv_buf_hide[0];
+      }
+    } else {
+      u_recv_buf_hide.resize(_unified_buffer_size);
+      u_recv_buf_p=&u_recv_buf_hide[0];
+    }

    PrecomputeByteOffsets();

@@ -660,10 +676,7 @@ PARALLEL_FOR_LOOP
    }
  }
  
-
-
-       template<class compressor>
-       void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
+  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    calls++;
@@ -675,8 +688,7 @@ PARALLEL_FOR_LOOP
    CommsMerge(); // spins
  }
  
-       template<class compressor>
-       void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
+  template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
  {
    int dimension    = _directions[point];
    int displacement = _distances[point];
@@ -734,7 +746,6 @@ PARALLEL_FOR_LOOP
    assert(source._grid==_grid);
    halogtime-=usecond();
    
-	 assert (comm_buf.size() == _unified_buffer_size );
    u_comm_offset=0;
    
    // Gather all comms buffers
@@ -779,9 +790,6 @@ PARALLEL_FOR_LOOP
      int sx        = (x+sshift)%rd;
      int comm_proc = ((x+sshift)/rd)%pd;
      
-	     cobj *u_send_buf_p;
-	     cobj   *comm_buf_p;
-
      if (comm_proc) {

 	int words = buffer_size;
@@ -794,36 +802,48 @@ PARALLEL_FOR_LOOP
 	if ( !face_table_computed ) {
 	  t_table-=usecond();
 	  face_table.resize(face_idx+1);
-		 cobj *ptr; ptr = &u_send_buf[0];
 	  Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,
 					     face_table[face_idx]);
 	  t_table+=usecond();
 	}
-	       t_data-=usecond();
-	       Gather_plane_simple_table         (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so);  face_idx++;
-	       t_data+=usecond();
-	       gathertime+=usecond();
+	
 	
 	int rank           = _grid->_processor;
 	int recv_from_rank;
 	int xmit_to_rank;
 	_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+	
 	assert (xmit_to_rank   != _grid->ThisRank());
 	assert (recv_from_rank != _grid->ThisRank());
 	
-	       //      FIXME Implement asynchronous send & also avoid buffer copy
-	       AddPacket((void *)&u_send_buf[u_comm_offset],
-			 (void *)  &comm_buf[u_comm_offset],
+	/////////////////////////////////////////////////////////
+	// Gather straight into the destination rank's shared-memory buffer when one exists, else into u_send_buf
+	/////////////////////////////////////////////////////////
+
+	cobj *u_send_buf_p = &u_send_buf[0];
+	if (ShmDirectCopy) { 
+	  cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank);
+	  if ( shm!=NULL) { 
+	    u_send_buf_p = shm;
+	  }
+	}
+	
+	t_data-=usecond();
+	Gather_plane_simple_table         (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so);  face_idx++;
+	t_data+=usecond();
+	
+	AddPacket((void *)&u_send_buf_p[u_comm_offset],
+		  (void *)&u_recv_buf_p[u_comm_offset],
 		  xmit_to_rank,
 		  recv_from_rank,
 		  bytes);

+	gathertime+=usecond();
 	u_comm_offset+=words;
      }
    }
  }
  
-
  template<class compressor>
  void  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
  {
@@ -904,10 +924,6 @@ PARALLEL_FOR_LOOP
 	  auto rp = &u_simd_recv_buf[i       ][u_comm_offset];
 	  auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
 	  
-		 void *vrp = (void *)rp;
-		 void *vsp = (void *)sp;
-
-
 	  if(nbr_proc){
 	    
 	    int recv_from_rank;
@@ -915,7 +931,7 @@ PARALLEL_FOR_LOOP
 	    
 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 	    
-		   AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
+	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
 	    
 	    rpointers[i] = rp;
 	    
@@ -926,7 +942,8 @@ PARALLEL_FOR_LOOP
 	  }
 	}

-	       AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
+	assert(0);
+	AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
 	
 	u_comm_offset     +=buffer_size;
      }
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -80,7 +80,6 @@ class CartesianCommunicator {

    void * ShmCommBuf;
    std::vector<void *> ShmCommBufs;
-    std::vector<void *> ShmStencilBufs;

    int WorldRank;
    int WorldSize;
@@ -105,6 +104,10 @@ class CartesianCommunicator {
    int  RankFromProcessorCoor(std::vector<int> &coor);
    void ProcessorCoorFromRank(int rank,std::vector<int> &coor);

+    // Helper functions for SHM windows in MPI3; return NULL when no shared buffer is available
+    void *ShmBufferSelf(void);
+    void *ShmBuffer(int rank);
+
    /////////////////////////////////
    // Grid information queries
    /////////////////////////////////
@@ -173,6 +176,16 @@ class CartesianCommunicator {
 			 int recv_from_rank,
 			 int bytes);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+    void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+			 void *xmit,
+			 int xmit_to_rank,
+			 void *recv,
+			 int recv_from_rank,
+			 int bytes);
+    void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+    {
+      SendToRecvFromComplete(waitall);
+    }

    ////////////////////////////////////////////////////////////
    // Barrier
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -67,6 +67,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  
  assert(Size==_Nprocessors);
 }
+void *CartesianCommunicator::ShmBufferSelf(void)
+{
+  return NULL;
+}
+void *CartesianCommunicator::ShmBuffer(int rank)
+{
+  return NULL;
+}

 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -197,10 +197,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Verbose for now
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    std::cout<< "Ranks per node "<< ShmSize << std::endl;
-    std::cout<< "Nodes          "<< GroupSize << std::endl;
-    std::cout<< "Ranks          "<< WorldSize << std::endl;
-    std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl;
+    std::cout<<GridLogMessage<< "MPI-3 configuration: Ranks per node "<< ShmSize ;
+    std::cout<< " Nodes "<< GroupSize;
+    std::cout<< " Ranks "<< WorldSize;
+    std::cout<< " Shm CommBuf address"<< std::hex <<ShmCommBuf << std::dec<<std::endl;

    // Done
    ShmSetup=1;
@@ -208,12 +208,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  }

  ShmCommBufs.resize(ShmSize);
-  ShmStencilBufs.resize(ShmSize);
  for(int r=0;r<ShmSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
-    ShmStencilBufs[r] = (void *) ((uint64_t)ShmCommBufs[r]+MAX_MPI_SHM_BYTES/4);
  }
  
  ////////////////////////////////////////////////////////////////
@@ -240,6 +238,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  ShmCoor.resize(_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);
+
  for(int l2=0;l2<log2size;l2++){
    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
    ShmDims[dim]*=2;
@@ -347,6 +346,21 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
  }
 }

+
+void *CartesianCommunicator::ShmBufferSelf(void)
+{
+  return ShmCommBufs[ShmRank];
+}
+void *CartesianCommunicator::ShmBuffer(int rank)
+{
+  int gpeer = GroupRanks[rank];
+  if (gpeer == MPI_UNDEFINED){
+    return NULL;
+  } else { 
+    return ShmCommBufs[gpeer];
+  }
+}
+
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
@@ -355,13 +369,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-#undef SHM_USE_BCOPY
  MPI_Request xrq;
  MPI_Request rrq;
  
  static int sequence;

-  int rank = _processor;
  int ierr;
  int tag;
  int check;
@@ -370,6 +382,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  assert(from != _processor);
  
  int gdest = GroupRanks[dest];
+  int gfrom = GroupRanks[from];
  int gme   = GroupRanks[_processor];

  sequence++;
@@ -379,30 +392,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis

  int small = (bytes<MAX_MPI_SHM_BYTES);

-#ifndef SHM_USE_BCOPY
  typedef vRealD T;
  int words = bytes/sizeof(T);
-  assert(((size_t)bytes &(sizeof(T)-1))==0);
-  //  assert(((size_t)xmit  &(sizeof(T)-1))==0);
-  //  assert(((size_t)recv  &(sizeof(T)-1))==0);
-#endif

+  assert(((size_t)bytes &(sizeof(T)-1))==0);
  assert(gme == ShmRank);

-  //  std::cerr << "proc dest from gme  gdest "<<_processor<<" "<<dest <<" "<< from <<" "<<gme<<" "<< gdest<<std::endl; Barrier();
-  if ( small && (dest !=MPI_UNDEFINED) ) {
+  if ( small && (gdest !=MPI_UNDEFINED) ) {
+
    assert(gme != gdest);

-#ifdef SHM_USE_BCOPY
-    bcopy(xmit,to_ptr,bytes);
-#else
    T *ip = (T *)xmit;
    T *op = (T *)to_ptr;
 PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
      vstream(op[w],ip[w]);
    }
-#endif
+
    bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
    bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence));
  } else { 
@@ -411,24 +417,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
    list.push_back(xrq);
  }
  
-  //  std::cout << "Syncing "<<std::endl; Barrier();
  MPI_Win_sync (ShmWindow);   
  MPI_Barrier  (ShmComm);
  MPI_Win_sync (ShmWindow);   

-  //  std::cout << "Receiving "<<std::endl; Barrier();
-  
-  if (small && (from !=MPI_UNDEFINED) ) {
-#ifdef SHM_USE_BCOPY
-    bcopy(from_ptr,recv,bytes);
-#else
+  if (small && (gfrom !=MPI_UNDEFINED) ) {
    T *ip = (T *)from_ptr;
    T *op = (T *)recv;
 PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
      vstream(op[w],ip[w]);
    }
-#endif
    bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag));
    bcopy(&from_ptr[bytes+4],&check,sizeof(check));
    assert(check==sequence);
@@ -439,27 +438,51 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
    list.push_back(rrq);
  }

-  //  std::cout << "Syncing"<<std::endl; Barrier();
+  MPI_Win_sync (ShmWindow);   
+  MPI_Barrier  (ShmComm);
+  MPI_Win_sync (ShmWindow);   
+}
+
+void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+						       void *xmit,
+						       int dest,
+						       void *recv,
+						       int from,
+						       int bytes)
+{
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  
+  int gdest = GroupRanks[dest];
+  int gfrom = GroupRanks[from];
+  int gme   = GroupRanks[_processor];
+
+  assert(gme == ShmRank);
+
+  if ( gdest == MPI_UNDEFINED ) {
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+    assert(ierr==0);
+    list.push_back(xrq);
+  }
+  
+  if ( gfrom ==MPI_UNDEFINED) {
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+    assert(ierr==0);
+    list.push_back(rrq);
+  }

  MPI_Win_sync (ShmWindow);   
  MPI_Barrier  (ShmComm);
  MPI_Win_sync (ShmWindow);   
  
-#if 0
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-  
-  assert(ierr==0);
-
-  list.push_back(xrq);
-  list.push_back(rrq);
-#endif
 }

+
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -33,6 +33,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 }

 int Rank(void ){ return 0; };
+void *CartesianCommunicator::ShmBufferSelf(void)
+{
+  return NULL;
+}
+void *CartesianCommunicator::ShmBuffer(int rank)
+{
+  return NULL;
+}

 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@@ -50,6 +50,14 @@ typedef struct HandShake_t {
 static Vector< HandShake > XConnections;
 static Vector< HandShake > RConnections;

+void *CartesianCommunicator::ShmBufferSelf(void)
+{
+  return NULL;
+}
+void *CartesianCommunicator::ShmBuffer(int rank)
+{
+  return NULL;
+}
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  shmem_init();
  XConnections.resize(shmem_n_pes());
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -33,7 +33,6 @@ directory
 #define GRID_QCD_FERMION_OPERATOR_IMPL_H

 namespace Grid {
-
 namespace QCD {


@@ -108,13 +107,14 @@ namespace Grid {
  INHERIT_GIMPL_TYPES(Base)	 \
  INHERIT_FIMPL_TYPES(Base)
  
-    ///////
+  /////////////////////////////////////////////////////////////////////////////
  // Single flavour four spinors with colour index
-    ///////
+  /////////////////////////////////////////////////////////////////////////////
  template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
-    class WilsonImpl
-      : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+  class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+
    public:
+
    static const int Dimension = Representation::Dimension;
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
      
@@ -124,7 +124,6 @@ namespace Grid {
    const bool LsVectorised=false;
    typedef _Coeff_t Coeff_t;

-
    INHERIT_GIMPL_TYPES(Gimpl);
      
    template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
@@ -158,8 +157,7 @@ namespace Grid {
    }
      
    template <class ref>
-      inline void loadLinkElement(Simd &reg,
-				  ref &memory) {
+    inline void loadLinkElement(Simd &reg, ref &memory) {
      reg = memory;
    }
      
@@ -202,9 +200,10 @@ namespace Grid {
    }
  };

-    ///////
+  ////////////////////////////////////////////////////////////////////////////////////
  // Single flavour four spinors with colour index, 5d redblack
-    ///////
+  ////////////////////////////////////////////////////////////////////////////////////
+
 template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
 class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
  public:
@@ -227,12 +226,9 @@ namespace Grid {
  typedef Lattice<SiteSpinor> FermionField;
  
  // Make the doubled gauge field a *scalar*
-      typedef iImplDoubledGaugeField<typename Simd::scalar_type>
-      SiteDoubledGaugeField;  // This is a scalar
-      typedef iImplGaugeField<typename Simd::scalar_type>
-      SiteScalarGaugeField;  // scalar
-      typedef iImplGaugeLink<typename Simd::scalar_type>
-      SiteScalarGaugeLink;  // scalar
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
      
  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
      
@@ -250,6 +246,7 @@ namespace Grid {
  inline void loadLinkElement(Simd &reg, ref &memory) {
    vsplat(reg, memory);
  }
+
  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
 		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
 		       StencilImpl &St) {
@@ -262,8 +259,8 @@ namespace Grid {
    mult(&phi(), &UU(), &chi());
  }
      
-      inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,
-			      const GaugeField &Umu) {
+  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
+  {
    SiteScalarGaugeField ScalarUmu;
    SiteDoubledGaugeField ScalarUds;
    
@@ -289,13 +286,13 @@ namespace Grid {
    }
  }
      
-      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
-				FermionField &A, int mu) {
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
+  {
    assert(0);
  }
      
-      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
-				FermionField &Atilde, int mu) {
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu) 
+  {
 	assert(0);
  }
 };
@@ -305,9 +302,9 @@ namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////
    
 template <class S, int Nrepresentation,class _Coeff_t = RealD>
-    class GparityWilsonImpl
-      : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
+class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
 public:
+
 static const int Dimension = Nrepresentation;

 const bool LsVectorised=false;
@@ -317,15 +314,9 @@ namespace Grid {
 
 INHERIT_GIMPL_TYPES(Gimpl);
      
-      template <typename vtype>
-      using iImplSpinor =
-      iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
-      template <typename vtype>
-      using iImplHalfSpinor =
-	iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
-      template <typename vtype>
-      using iImplDoubledGaugeField =
-	iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
+ template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
+ template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
+ template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
      
 typedef iImplSpinor<Simd> SiteSpinor;
 typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
@@ -341,7 +332,6 @@ namespace Grid {
      
 ImplParams Params;

-
 GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};

 bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
@@ -351,6 +341,7 @@ namespace Grid {
 inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
 		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
 		      StencilImpl &St) {
+
  typedef SiteHalfSpinor vobj;
   typedef typename SiteHalfSpinor::scalar_object sobj;
 	
@@ -419,7 +410,6 @@ namespace Grid {

 inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 {
-	
   conformable(Uds._grid,GaugeGrid);
   conformable(Umu._grid,GaugeGrid);
   
@@ -429,7 +419,6 @@ namespace Grid {
   
   Lattice<iScalar<vInteger> > coor(GaugeGrid);
 	
-	
   for(int mu=0;mu<Nd;mu++){
 	  
     LatticeCoordinate(coor,mu);
@@ -443,7 +432,6 @@ namespace Grid {
       Uconj = where(coor==neglink,-Uconj,Uconj);
     }
 	  
-	  
 PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](0)(mu) = U[ss]();
@@ -477,8 +465,8 @@ namespace Grid {
 }
      
      
-      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
-				FermionField &A, int mu) {
+ inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
+
   // DhopDir provides U or Uconj depending on coor/flavour.
   GaugeLinkField link(mat._grid);
   // use lorentz for flavour as hack.
@@ -491,8 +479,8 @@ namespace Grid {
   return;
 }
      
-      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
-				FermionField &Atilde, int mu) {
+ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+
   int Ls = Btilde._grid->_fdimensions[0];
 	
   GaugeLinkField tmp(mat._grid);
@@ -508,13 +496,13 @@ namespace Grid {
   PokeIndex<LorentzIndex>(mat, tmp, mu);
   return;
 }
+
 };

 typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double

-
 typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
@@ -538,6 +526,7 @@ namespace Grid {
 typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR;  // Real.. whichever prec
 typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float
 typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double
-}
-}
+
+}}
+
 #endif
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -166,7 +166,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    ////////////////////////
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < B._grid->oSites(); sss++) {
-      Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu,
+      Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
                               gamma);
    }

@@ -277,7 +277,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,

  PARALLEL_FOR_LOOP
  for (int sss = 0; sss < in._grid->oSites(); sss++) {
-    Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out,
+    Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out,
                             dirdisp, gamma);
  }
 };
@@ -295,13 +295,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
  if (dag == DaggerYes) {
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
+      Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
                                   out);
    }
  } else {
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
+      Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
                                out);
    }
  }
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -185,18 +185,14 @@ void WilsonFermion5D<Impl>::Report(void)
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls   << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime
-              << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : "
-              << DhopCommTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : "
-              << DhopComputeTime << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : "
-              << DhopComputeTime / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " << DhopCommTime / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " << DhopComputeTime << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;

    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;

   }

@@ -210,12 +206,9 @@ void WilsonFermion5D<Impl>::Report(void)
    std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
    
-
-
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
-
  }

  if (DerivCalls > 0 || DhopCalls > 0){
@@ -275,7 +268,7 @@ PARALLEL_FOR_LOOP
    for(int s=0;s<Ls;s++){
      int sU=ss;
      int sF = s+Ls*sU; 
-      Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
+      Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
    }
  }
 };
@@ -327,8 +320,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
        assert(sF < B._grid->oSites());
        assert(sU < U._grid->oSites());

-        Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu,
-                                 gamma);
+        Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);

        ////////////////////////////
        // spin trace outer product
@@ -412,8 +404,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
-      Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
-                                   out);
+      Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out);
    }
 #ifdef AVX512
  } else if (stat.is_init() ) {
@@ -427,11 +418,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
    int mythread = omp_get_thread_num();
    stat.enter(mythread);
 #pragma omp for nowait
-   for(int ss=0;ss<U._grid->oSites();ss++)
-    {
+    for(int ss=0;ss<U._grid->oSites();ss++) {
      int sU=ss;
      int sF=LLs*sU;
-       Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
+      Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
    stat.exit(mythread);
    }
@@ -442,8 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
-      Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
-                                out);
+      Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
  }
  DhopComputeTime+=usecond();
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -34,7 +34,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Stat.h>

 namespace Grid {
-
 namespace QCD {

  ////////////////////////////////////////////////////////////////////////////////
@@ -182,7 +181,7 @@ namespace Grid {
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
  };
-  }
-}
+
+}}

 #endif
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -43,9 +43,8 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
 ////////////////////////////////////////////

 template <class Impl>
-void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
-    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    commVector<SiteHalfSpinor> &buf, int sF,
+void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+						     SiteHalfSpinor *buf, int sF,
 						     int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
@@ -220,9 +219,8 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(

 // Need controls to do interior, exterior, or both
 template <class Impl>
-void WilsonKernels<Impl>::DiracOptGenericDhopSite(
-    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    commVector<SiteHalfSpinor> &buf, int sF,
+void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+						  SiteHalfSpinor *buf, int sF,
 						  int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
@@ -396,10 +394,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
 };

 template <class Impl>
-void WilsonKernels<Impl>::DiracOptDhopDir(
-    StencilImpl &st, DoubledGaugeField &U,
-    commVector<SiteHalfSpinor> &buf, int sF,
+void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
 					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
+
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteSpinor result;
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -32,7 +32,6 @@ directory
 #define GRID_QCD_DHOP_H

 namespace Grid {
-
 namespace QCD {

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -56,16 +55,11 @@ namespace Grid {
   
  template <bool EnableBool = true>
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
-	DiracOptDhopSite(
-			 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			 commVector<SiteHalfSpinor> &buf,
-			 int sF, int sU, int Ls, int Ns, const FermionField &in,
-			 FermionField &out) {
+  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
 #ifdef AVX512
    if (AsmOpt) {
-	  WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns,
-						   in, out);
-
+      WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
    } else {
 #else
    {
@@ -73,11 +67,9 @@ namespace Grid {
      for (int site = 0; site < Ns; site++) {
 	for (int s = 0; s < Ls; s++) {
 	  if (HandOpt)
-		  WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU,
-							    in, out);
+	    WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
 	  else
-		  WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
-							       in, out);
+	    WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 	  sF++;
 	}
 	sU++;
@@ -87,15 +79,12 @@ namespace Grid {
     
  template <bool EnableBool = true>
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
-	  DiracOptDhopSite(
-			   StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			   commVector<SiteHalfSpinor> &buf,
-			   int sF, int sU, int Ls, int Ns, const FermionField &in,
-			   FermionField &out) {
+  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+     
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
-	      WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
-							   out);
+	WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
 	sF++;
      }
      sU++;
@@ -103,17 +92,12 @@ namespace Grid {
  }
     
  template <bool EnableBool = true>
-	  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
-				  void>::type
-	  DiracOptDhopSiteDag(
-			      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			      commVector<SiteHalfSpinor> &buf,
-			      int sF, int sU, int Ls, int Ns, const FermionField &in,
-			      FermionField &out) {
+  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
+  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
 #ifdef AVX512
    if (AsmOpt) {
-				      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
-										  Ns, in, out);
+      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
    } else {
 #else
    {
@@ -121,11 +105,9 @@ namespace Grid {
      for (int site = 0; site < Ns; site++) {
 	for (int s = 0; s < Ls; s++) {
 	  if (HandOpt)
-					      WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
-											   in, out);
+	    WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 	  else
-					      WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
-											      sU, in, out);
+	    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 	  sF++;
 	}
 	sU++;
@@ -134,73 +116,48 @@ namespace Grid {
  }

  template <bool EnableBool = true>
-				      typename std::enable_if<
-				      (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
-				      void>::type
-				      DiracOptDhopSiteDag(
-							  StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							  commVector<SiteHalfSpinor> &buf,
-							  int sF, int sU, int Ls, int Ns, const FermionField &in,
-							  FermionField &out) {
+  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
+  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
+		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
-					    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
-											    in, out);
+	WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 	sF++;
      }
      sU++;
    }
  }

-				    void DiracOptDhopDir(
-							 StencilImpl &st, DoubledGaugeField &U,
-							 commVector<SiteHalfSpinor> &buf,
-							 int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
-							 int gamma);
+  void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
+		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
      
 private:
     // Specialised variants
-				    void DiracOptGenericDhopSite(
-								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								 commVector<SiteHalfSpinor> &buf,
+  void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 			       int sF, int sU, const FermionField &in, FermionField &out);
      
-				    void DiracOptGenericDhopSiteDag(
-								    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								    commVector<SiteHalfSpinor> &buf,
+  void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 				  int sF, int sU, const FermionField &in, FermionField &out);

-				    void DiracOptAsmDhopSite(
-							     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							     commVector<SiteHalfSpinor> &buf,
-							     int sF, int sU, int Ls, int Ns, const FermionField &in,
-							     FermionField &out);
+  void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);

-				    void DiracOptAsmDhopSiteDag(
-								StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								commVector<SiteHalfSpinor> &buf,
-								int sF, int sU, int Ls, int Ns, const FermionField &in,
-								FermionField &out);
+  void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);

-				    void DiracOptHandDhopSite(
-							      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							      commVector<SiteHalfSpinor> &buf,
+  void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 			    int sF, int sU, const FermionField &in, FermionField &out);

-				    void DiracOptHandDhopSiteDag(
-								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								 commVector<SiteHalfSpinor> &buf,
+  void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 			       int sF, int sU, const FermionField &in, FermionField &out);
      
 public:
+
  WilsonKernels(const ImplParams &p = ImplParams());
+
 };
    
-      }
-    }
-
-
-
-
+}}

 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -38,26 +38,22 @@ namespace Grid {
 ///////////////////////////////////////////////////////////
 // Default to no assembler implementation
 ///////////////////////////////////////////////////////////
-    template<class Impl>
-      void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                             commVector<SiteHalfSpinor>  &buf,
-                             int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-    {
-      assert(0);
-    }
-    template<class Impl>
-      void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                commVector<SiteHalfSpinor>  &buf,
+template<class Impl> void 
+WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
  assert(0);
 }

-
+template<class Impl> void 
+WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0);
+}

 #if defined(AVX512) 
    
-    
    ///////////////////////////////////////////////////////////
    // If we are AVX512 specialise the single precision routine
    ///////////////////////////////////////////////////////////
@@ -84,16 +80,14 @@ namespace Grid {
 #define FX(A) WILSONASM_ ##A
  
 #undef KERNEL_DAG
-    template<>
-    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							 commVector<SiteHalfSpinor>  &buf,
+template<> void 
+WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
      
 #define KERNEL_DAG
-    template<>
-    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							    commVector<SiteHalfSpinor>  &buf,
+template<> void 
+WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
@@ -109,31 +103,26 @@ namespace Grid {
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 				    
 #undef KERNEL_DAG
-    template<>
-    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								  commVector<SiteHalfSpinor>  &buf,
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
 #define KERNEL_DAG
-    template<>
-    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								     commVector<SiteHalfSpinor>  &buf,
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
 #endif

-
 #define INSTANTIATE_ASM(A)\
-template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                   commVector<SiteHalfSpinor>  &buf,\
+template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                   commVector<SiteHalfSpinor>  &buf,\
+ \
+template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\

-
 INSTANTIATE_ASM(WilsonImplF);
 INSTANTIATE_ASM(WilsonImplD);
 INSTANTIATE_ASM(ZWilsonImplF);
@@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF);
 INSTANTIATE_ASM(DomainWallVec5dImplD);
 INSTANTIATE_ASM(ZDomainWallVec5dImplF);
 INSTANTIATE_ASM(ZDomainWallVec5dImplD);
-  }
-}
+
+}}

--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -311,9 +311,8 @@ namespace Grid {
 namespace QCD {


-  template<class Impl>
-  void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					       commVector<SiteHalfSpinor>  &buf,
+template<class Impl> void 
+WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
@@ -555,8 +554,7 @@ namespace QCD {
 }

 template<class Impl>
-  void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					       commVector<SiteHalfSpinor>  &buf,
+void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionField &in, FermionField &out)
 {
  //  std::cout << "Hand op Dhop "<<std::endl;
@@ -798,37 +796,34 @@ namespace QCD {
  }
 }

-
  ////////////////////////////////////////////////
  // Specialise Gparity to simple implementation
  ////////////////////////////////////////////////
-template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							     commVector<SiteHalfSpinor>  &buf,
+template<> void 
+WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+							SiteHalfSpinor *buf,
 							int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }

-template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								commVector<SiteHalfSpinor>  &buf,
+template<> void 
+WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+							   SiteHalfSpinor *buf,
 							   int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }

-template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							     commVector<SiteHalfSpinor>  &buf,
+template<> void 
+WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }

-template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								commVector<SiteHalfSpinor>  &buf,
+template<> void 
+WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							   int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
@@ -840,11 +835,9 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 // Need Nc=3 though //

 #define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-							       commVector<SiteHalfSpinor>  &buf,\
+template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 						     int ss,int sU,const FermionField &in, FermionField &out); \
-template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-								  commVector<SiteHalfSpinor>  &buf,\
+template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 							int ss,int sU,const FermionField &in, FermionField &out);

 INSTANTIATE_THEM(WilsonImplF);
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -116,7 +116,7 @@ int main (int argc, char ** argv)
 	  else if (SE->_is_local)
 	    Check._odata[i] = Foo._odata[SE->_offset];
 	  else 
-	    Check._odata[i] = myStencil.comm_buf[SE->_offset];
+	    Check._odata[i] = myStencil.CommBuf()[SE->_offset];
 	}

 	Real nrmC = norm2(Check);
@@ -207,7 +207,7 @@ int main (int argc, char ** argv)
 	  else if (SE->_is_local)
 	    OCheck._odata[i] = EFoo._odata[SE->_offset];
 	  else 
-	    OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
+	    OCheck._odata[i] = EStencil.CommBuf()[SE->_offset];
 	}
 	for(int i=0;i<ECheck._grid->oSites();i++){
 	  int permute_type;
@@ -220,7 +220,7 @@ int main (int argc, char ** argv)
 	  else if (SE->_is_local)
 	    ECheck._odata[i] = OFoo._odata[SE->_offset];
 	  else 
-	    ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
+	    ECheck._odata[i] = OStencil.CommBuf()[SE->_offset];
 	}
 	
 	setCheckerboard(Check,ECheck);