mirror of https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Overlap comms compute changes
This commit is contained in:
parent c99d748da6
commit d19321dfde
lib/Stencil.h: 270 lines changed
@@ -7,8 +7,6 @@
    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -88,11 +86,78 @@ namespace Grid {
     typedef typename cobj::scalar_type scalar_type;
     typedef typename cobj::scalar_object scalar_object;

     //////////////////////////////////////////
     // Comms packet queue for asynch thread
     //////////////////////////////////////////

     struct Packet {
       void * send_buf;
       void * recv_buf;
       Integer to_rank;
       Integer from_rank;
       Integer bytes;
     };

     std::vector<Packet> Packets;

     void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
       Packet p;
       p.send_buf = xmit;
       p.recv_buf = rcv;
       p.to_rank  = to;
       p.from_rank= from;
       p.bytes    = bytes;
       Packets.push_back(p);
     }

     void Communicate(void) {
       for(int i=0;i<Packets.size();i++){
         _grid->SendToRecvFrom(Packets[i].send_buf,
                               Packets[i].to_rank,
                               Packets[i].recv_buf,
                               Packets[i].from_rank,
                               Packets[i].bytes);
       }
     }
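
[The new Packet queue decouples posting an exchange from executing it: AddPacket only records the transfer, and Communicate drains the whole queue later, typically from the thread returned by HaloExchangeBegin further down. A minimal standalone sketch of the same deferred-transfer idea; the names are hypothetical and std::memcpy stands in for SendToRecvFrom:]

    #include <cstring>
    #include <thread>
    #include <vector>

    // Hypothetical stand-ins for Grid's types; memcpy replaces the MPI transport.
    struct Packet { const void *send_buf; void *recv_buf; int to_rank, from_rank, bytes; };

    struct PacketQueue {
      std::vector<Packet> Packets;
      void AddPacket(const void *xmit, void *rcv, int to, int from, int bytes) {
        Packets.push_back({xmit, rcv, to, from, bytes});   // record only, send later
      }
      void Communicate() {                                 // drained once, possibly on another thread
        for (auto &p : Packets) std::memcpy(p.recv_buf, p.send_buf, p.bytes);
      }
    };

    int main() {
      PacketQueue q;
      char src[16] = "halo data", dst[16] = {0};
      q.AddPacket(src, dst, /*to*/1, /*from*/0, sizeof(src));
      std::thread comms([&] { q.Communicate(); });  // overlap window opens here
      /* ... interior compute would run here ... */
      comms.join();                                 // halo now valid in dst
      return 0;
    }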

     ///////////////////////////////////////////
     // Simd merge queue for asynch comms
     ///////////////////////////////////////////
     struct Merge {
       cobj * mpointer;
       std::vector<scalar_object *> rpointers;
       Integer buffer_size;
     };

     std::vector<Merge> Mergers;

     void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size) {
       Merge m;
       m.mpointer = merge_p;
       m.rpointers= rpointers;
       m.buffer_size = buffer_size;
       Mergers.push_back(m);
     }

     void CommsMerge(void) {
       mergetime-=usecond();
       for(int i=0;i<Mergers.size();i++){
PARALLEL_FOR_LOOP
         for(int o=0;o<Mergers[i].buffer_size;o++){
           merge(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
         }
       }
       mergetime+=usecond();
     }
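
[CommsMerge is the second half of the split: each Merge entry remembers where the per-lane scalar halos will land, and the interleave back into SIMD words is deferred until the packets have arrived. A toy illustration of the lane merge itself; a plain array stands in for Grid's vector types, and this merge() is a hypothetical re-implementation, not Grid's:]

    #include <array>
    #include <cstdio>
    #include <vector>

    constexpr int Nsimd = 4;
    struct vec { std::array<double, Nsimd> lane; };   // stand-in for a SIMD vector

    // Gather element o from each lane's scalar buffer into one SIMD word.
    void merge(vec &out, const std::vector<const double *> &rpointers, int o) {
      for (int l = 0; l < Nsimd; l++) out.lane[l] = rpointers[l][o];
    }

    int main() {
      std::vector<double> lane0 = {0, 1}, lane1 = {10, 11}, lane2 = {20, 21}, lane3 = {30, 31};
      std::vector<const double *> rp = {lane0.data(), lane1.data(), lane2.data(), lane3.data()};
      std::vector<vec> buf(2);
      for (int o = 0; o < 2; o++) merge(buf[o], rp, o);       // the CommsMerge inner loop
      std::printf("%g %g\n", buf[1].lane[0], buf[1].lane[3]); // prints "1 31"
      return 0;
    }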

     ////////////////////////////////////////
     // Basic Grid and stencil info
     ////////////////////////////////////////

     int _checkerboard;
     int _npoints; // Move to template param?
     GridBase * _grid;

     // npoints of these
     std::vector<int> _directions;
     std::vector<int> _distances;
@@ -101,19 +166,21 @@ namespace Grid {

     // npoints x Osites() of these
     std::vector<std::vector<StencilEntry> > _entries;

     // Comms buffers
     std::vector<std::vector<scalar_object> > send_buf_extract;
     std::vector<std::vector<scalar_object> > recv_buf_extract;
     std::vector<scalar_object *> pointers;
     std::vector<scalar_object *> rpointers;
     Vector<cobj> send_buf;

     inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }

     // Comms buffers
     std::vector<Vector<scalar_object> > u_simd_send_buf;
     std::vector<Vector<scalar_object> > u_simd_recv_buf;
     Vector<cobj> u_send_buf;
     Vector<cobj> comm_buf;
     int u_comm_offset;
     int _unified_buffer_size;
     int _request_count;

     /////////////////////////////////////////
     // Timing info; ugly; possibly temporary
     /////////////////////////////////////////
#define TIMING_HACK
#ifdef TIMING_HACK
     double buftime;
     double gathertime;
     double commtime;
@@ -124,9 +191,7 @@ namespace Grid {
     double gathermtime;
     double splicetime;
     double nosplicetime;

#endif

     CartesianStencil(GridBase *grid,
                      int npoints,
@@ -135,6 +200,7 @@ namespace Grid {
                      const std::vector<int> &distances)
       : _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
     {
#ifdef TIMING_HACK
       gathertime=0;
       commtime=0;
       commstime=0;
@@ -145,13 +211,12 @@ namespace Grid {
       buftime=0;
       splicetime=0;
       nosplicetime=0;

#endif
       _npoints = npoints;
       _grid    = grid;
       _directions = directions;
       _distances  = distances;
       _unified_buffer_size=0;
       _request_count =0;

       int osites = _grid->oSites();

@@ -197,22 +262,25 @@ namespace Grid {
         sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);

         if ( sshift[0] == sshift[1] ) {
           // std::cout<<"Comms 0x3"<<std::endl;
           Comms(point,dimension,shift,0x3);
         } else {
           // std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
           Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
           Comms(point,dimension,shift,0x2);// both with block stride loop iteration
         }
       }
       // for(int ss=0;ss<osites;ss++){
       //   std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
       //   _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
       // }
     }
     u_send_buf.resize(_unified_buffer_size);
     comm_buf.resize(_unified_buffer_size);

     const int Nsimd = grid->Nsimd();
     u_simd_send_buf.resize(Nsimd);
     u_simd_recv_buf.resize(Nsimd);
     for(int l=0;l<Nsimd;l++){
       u_simd_send_buf[l].resize(_unified_buffer_size);
       u_simd_recv_buf[l].resize(_unified_buffer_size);
     }
   }

   void Local (int point, int dimension,int shiftpm,int cbmask)
   {
     int fd = _grid->_fdimensions[dimension];
@@ -276,17 +344,15 @@ namespace Grid {
     assert(shift<fd);

     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
     // std::cout << " dim " <<dimension<<" buffersize "<<buffer_size<<std::endl;

     _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
                                          // send to one or more remote nodes.

     int cb= (cbmask==0x2)? Odd : Even;
     int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);

     for(int x=0;x<rd;x++){

       int permute_type=grid->PermuteType(dimension);

       int sx = (x+sshift)%rd;
@@ -310,16 +376,9 @@ namespace Grid {
     } else {
       int comm_proc = ((x+sshift)/rd)%pd;
       offnode = (comm_proc!= 0);
       // std::cout << "Stencil x "<<x<<" shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<< " comm_proc "<<comm_proc<<" pd "<< pd <<std::endl;
     }

     // Stencil x 1 shift 3 sshift 3 fd 8 rd 2 offnode 0 sx 0 comm_proc 0 pd 2
     // x+sshift = 4
     // x+sshift/2 = 2
     // 2%2 == 0
     // Problem: sshift is wrong in "rd" for SIMD directions. The complex logic in Cshift_mpi is needed.

     int wraparound=0;
     if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
       wraparound = 1;
@@ -337,15 +396,13 @@ namespace Grid {
       int words = buffer_size;
       if (cbmask != 0x3) words=words>>1;

       // GatherPlaneSimple (point,dimension,sx,cbmask);

       int rank = grid->_processor;
       int recv_from_rank;
       int xmit_to_rank;

       int unified_buffer_offset = _unified_buffer_size;
       _unified_buffer_size += words;
       // std::cout<< "Comms dim "<<dimension<<" offset "<<unified_buffer_offset<<" size "<<" " << _unified_buffer_size<<std::endl;

       ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase

     }
@@ -441,39 +498,34 @@ namespace Grid {
       }
     }

     // CartesianStencil(GridBase *grid,
     //                  int npoints,
     //                  int checkerboard,
     //                  const std::vector<int> &directions,
     //                  const std::vector<int> &distances);

     std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
       Mergers.resize(0);
       Packets.resize(0);
       HaloGather(source,compress);
       return std::thread([&] { this->Communicate(); });
     }
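
[HaloExchangeBegin now returns the comms thread as a handle, putting the caller in charge of the overlap window: everything executed between Begin and Complete runs concurrently with the communication. A compact sketch of the call pattern, with free functions as hypothetical stand-ins for the stencil methods:]

    #include <thread>
    #include <cstdio>

    // Hypothetical stand-ins: begin() returns the comms thread, complete() joins it.
    std::thread HaloExchangeBegin() { return std::thread([] { std::puts("comms in flight"); }); }
    void HaloExchangeComplete(std::thread &thr) { thr.join(); /* then CommsMerge() */ }

    int main() {
      std::thread handle = HaloExchangeBegin();       // gather + launch comms
      std::puts("interior compute overlapped here");  // work not needing the halo
      HaloExchangeComplete(handle);                   // join, then merge SIMD halos
      std::puts("exterior compute uses halo");        // work needing received data
      return 0;
    }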

     // Add to tables for various cases; is this mistaken. only local if 1 proc in dim
     // Can this be avoided with simpler coding of comms?
     // void Local (int point, int dimension,int shift,int cbmask);
     // void Comms (int point, int dimension,int shift,int cbmask);
     // void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
     // void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);

     // Could allow a functional munging of the halo to another type during the comms.
     // this could implement the 16bit/32bit/64bit compression.
     void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
     void HaloExchange(const Lattice<vobj> &source,compressor &compress)
     {
       auto thr = HaloExchangeBegin(source,compress);
       HaloExchangeComplete(thr);
     }
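
[The comment above suggests letting the compressor change the halo's type in flight; since the gather already funnels every site through compress(), a precision-dropping functor is enough. A hedged sketch of what a 64-to-32-bit compressor could look like; the signature is simplified, and Grid's real compressors also receive coordinate arguments:]

    #include <cstdio>
    #include <vector>

    // Hypothetical compressor in the spirit of the comment: the gather applies a
    // functor that can change precision on the way into the send buffer.
    struct HalfPrecisionCompressor {
      float operator()(double site) const { return static_cast<float>(site); } // 64 -> 32 bit
    };

    template <class vobj, class cobj, class compressor>
    void gather_plane(const std::vector<vobj> &field, std::vector<cobj> &buffer, compressor &compress) {
      buffer.resize(field.size());
      for (std::size_t i = 0; i < field.size(); i++) buffer[i] = compress(field[i]);
    }

    int main() {
      std::vector<double> plane = {1.0000000001, 2.5};
      std::vector<float> sendbuf;
      HalfPrecisionCompressor c;
      gather_plane(plane, sendbuf, c);   // halo is sent at reduced precision
      std::printf("%.10f -> %.10f\n", plane[0], (double)sendbuf[0]);
      return 0;
    }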

     void HaloExchangeComplete(std::thread &thr)
     {
       std::thread thr = HaloExchangeBegin(source,u_comm_buf,compress);
       thr.join();
       CommsMerge();
     }

     std::thread HaloExchangeBegin(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > & u_comm_buf,compressor &compress) {
       return std::thread([&] { this->HaloExchangeBlocking(source,u_comm_buf,compress); });
     }

     void HaloExchangeBlocking(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
     void HaloGather(const Lattice<vobj> &source,compressor &compress)
     {
       // conformable(source._grid,_grid);
       assert(source._grid==_grid);
       halotime-=usecond();
       if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
       int u_comm_offset=0;

       assert (comm_buf.size() == _unified_buffer_size );
       u_comm_offset=0;

       // Gather all comms buffers
       for(int point = 0 ; point < _npoints; point++) {
@@ -506,35 +558,34 @@ namespace Grid {
         if ( sshift[0] == sshift[1] ) {
           if (splice_dim) {
             splicetime-=usecond();
             GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
             GatherSimd(source,dimension,shift,0x3,compress);
             splicetime+=usecond();
           } else {
             nosplicetime-=usecond();
             GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
             Gather(source,dimension,shift,0x3,compress);
             nosplicetime+=usecond();
           }
         } else {
           // std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
           if(splice_dim){
             splicetime-=usecond();
             GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
             GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
             GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes
             GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration
             splicetime+=usecond();
           } else {
             nosplicetime-=usecond();
             GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
             GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
             Gather(source,dimension,shift,0x1,compress);
             Gather(source,dimension,shift,0x2,compress);
             nosplicetime+=usecond();
           }
         }
       }
     }

     assert(u_comm_offset==_unified_buffer_size);
     halotime+=usecond();
   }

   void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
                         std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
                         int &u_comm_offset,compressor & compress)
   void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
   {
     typedef typename cobj::vector_type vector_type;
     typedef typename cobj::scalar_type scalar_type;
@@ -555,8 +606,6 @@ namespace Grid {

     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];

     if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);

     int cb= (cbmask==0x2)? Odd : Even;
     int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);

@@ -573,7 +622,7 @@ namespace Grid {
     int bytes = words * sizeof(cobj);

     gathertime-=usecond();
     Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
     Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset);
     gathertime+=usecond();

     int rank = _grid->_processor;
@@ -585,11 +634,19 @@ namespace Grid {

     // FIXME Implement asynchronous send & also avoid buffer copy
     commtime-=usecond();
     /*
     _grid->SendToRecvFrom((void *)&send_buf[0],
                           xmit_to_rank,
                           (void *)&u_comm_buf[u_comm_offset],
                           (void *)&comm_buf[u_comm_offset],
                           recv_from_rank,
                           bytes);
     */
     AddPacket((void *)&u_send_buf[u_comm_offset],
               (void *)&comm_buf[u_comm_offset],
               xmit_to_rank,
               recv_from_rank,
               bytes);

     commtime+=usecond();

     u_comm_offset+=words;
@@ -598,14 +655,11 @@ namespace Grid {
   }

   void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
                             std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
                             int &u_comm_offset,compressor &compress)
   void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
   {
     buftime-=usecond();
     const int Nsimd = _grid->Nsimd();

     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
     int ld = _grid->_ldimensions[dimension];
@@ -628,20 +682,11 @@ namespace Grid {

     assert(cbmask==0x3); // Fixme think there is a latent bug if not true

     // Should grow to max size and then cost very little thereafter
     send_buf_extract.resize(Nsimd);
     recv_buf_extract.resize(Nsimd);
     for(int l=0;l<Nsimd;l++){
       if( send_buf_extract[l].size() < buffer_size) {
         send_buf_extract[l].resize(buffer_size);
         recv_buf_extract[l].resize(buffer_size);
       }
     }
     pointers.resize(Nsimd);
     rpointers.resize(Nsimd);

     int bytes = buffer_size*sizeof(scalar_object);

     std::vector<scalar_object *> rpointers(Nsimd);
     std::vector<scalar_object *> spointers(Nsimd);

     buftime+=usecond();

     ///////////////////////////////////////////
@@ -659,16 +704,19 @@ namespace Grid {
     if ( any_offnode ) {

       for(int i=0;i<Nsimd;i++){
         pointers[i] = &send_buf_extract[i][0];
         spointers[i] = &u_simd_send_buf[i][u_comm_offset];
       }

       int sx = (x+sshift)%rd;

       gathermtime-=usecond();
       Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
       Gather_plane_extract<cobj>(rhs,spointers,dimension,sx,cbmask,compress);
       gathermtime+=usecond();

       for(int i=0;i<Nsimd;i++){

         // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;

         int inner_bit = (Nsimd>>(permute_type+1));
         int ic= (i&inner_bit)? 1:0;

@@ -680,45 +728,43 @@ namespace Grid {
         int nbr_ox = (nbr_lcoor%rd); // outer coord of peer
         int nbr_lane = (i&(~inner_bit));

         int recv_from_rank;
         int xmit_to_rank;

         if (nbr_ic) nbr_lane|=inner_bit;
         assert (sx == nbr_ox);

         auto rp = &u_simd_recv_buf[i ][u_comm_offset];
         auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];

         void *vrp = (void *)rp;
         void *vsp = (void *)sp;

         if(nbr_proc){

           int recv_from_rank;
           int xmit_to_rank;

           _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

           commstime-=usecond();
           _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
                                 xmit_to_rank,
                                 (void *)&recv_buf_extract[i][0],
                                 recv_from_rank,
                                 bytes);
           AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
           commstime+=usecond();

           rpointers[i] = &recv_buf_extract[i][0];
           rpointers[i] = rp;

         } else {
           rpointers[i] = &send_buf_extract[nbr_lane][0];

           rpointers[i] = sp;

         }
       }

       // std::cout << " CommsSimd ["<<dimension<<"] offset "<<u_comm_offset<<" buffsize "<<buffer_size <<" unified buffer size "<<_unified_buffer_size<<std::endl;
       mergetime-=usecond();
PARALLEL_FOR_LOOP
       for(int i=0;i<buffer_size;i++){
         // std::cout<<"buffer loop " << i<<" "<<u_comm_offset+i<<" / "<<_unified_buffer_size<<std::endl;
         // assert(u_comm_offset+i<_unified_buffer_size);
         merge(u_comm_buf[u_comm_offset+i],rpointers,i);
       }
       mergetime+=usecond();
       u_comm_offset+=buffer_size;
       AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size);

       u_comm_offset +=buffer_size;
     }
   }
 }

 };
}
#endif
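
[Taken together, Stencil.h now splits the old blocking exchange into three phases: HaloGather packs into persistent send buffers and queues Packets and Mergers, Communicate drains the packets on the thread returned by HaloExchangeBegin, and CommsMerge rebuilds SIMD words afterwards. A self-contained sketch of that pipeline; the class is hypothetical and memcpy stands in for the transport:]

    #include <cstdio>
    #include <cstring>
    #include <thread>
    #include <vector>

    // Hypothetical three-phase halo flow mirroring the restructured Stencil.h.
    struct Halo {
      std::vector<char> send, recv;
      bool merged = false;
      void HaloGather()  { send.assign(8, 'x'); recv.assign(8, 0); }   // phase 1: pack + queue
      void Communicate() { std::memcpy(recv.data(), send.data(), 8); } // phase 2: move packets
      void CommsMerge()  { merged = true; }                            // phase 3: SIMD merge
      std::thread HaloExchangeBegin() { HaloGather(); return std::thread([this] { Communicate(); }); }
      void HaloExchangeComplete(std::thread &t) { t.join(); CommsMerge(); }
    };

    int main() {
      Halo h;
      std::thread t = h.HaloExchangeBegin();
      /* overlapped compute would run here */
      h.HaloExchangeComplete(t);
      std::printf("recv[0]=%c merged=%d\n", h.recv[0], (int)h.merged);
      return 0;
    }

[Note that AddMerge captures &comm_buf[u_comm_offset] before the offset is advanced, so each dimension's SIMD halo is merged into its own slice of the unified buffer.]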

@@ -204,7 +204,6 @@ namespace Grid {

     std::vector<CoarseMatrix> A;

     std::vector<siteVector,alignedAllocator<siteVector> > comm_buf;

     ///////////////////////
     // Interface
@@ -217,7 +216,7 @@ namespace Grid {
       conformable(in._grid,out._grid);

       SimpleCompressor<siteVector> compressor;
       Stencil.HaloExchange(in,comm_buf,compressor);
       Stencil.HaloExchange(in,compressor);

PARALLEL_FOR_LOOP
       for(int ss=0;ss<Grid()->oSites();ss++){
@@ -234,7 +233,7 @@ PARALLEL_FOR_LOOP
           } else if(SE->_is_local) {
             nbr = in._odata[SE->_offset];
           } else {
             nbr = comm_buf[SE->_offset];
             nbr = Stencil.comm_buf[SE->_offset];
           }
           res = res + A[point]._odata[ss]*nbr;
         }
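
[The loop above resolves each stencil entry either to the local field or, via the new Stencil.comm_buf member, to the stencil-owned halo. A toy version of that neighbour fetch, with hypothetical scalar types in place of Grid's site vectors:]

    #include <cstdio>
    #include <vector>

    // A stencil entry points either into the local field or into the halo buffer.
    struct StencilEntry { int _offset; bool _is_local; };

    double neighbour(const std::vector<double> &in, const std::vector<double> &comm_buf,
                     const StencilEntry &SE) {
      return SE._is_local ? in[SE._offset] : comm_buf[SE._offset];
    }

    int main() {
      std::vector<double> in = {1, 2, 3}, halo = {7, 8};
      StencilEntry local{2, true}, remote{1, false};
      std::printf("%g %g\n", neighbour(in, halo, local), neighbour(in, halo, remote)); // prints "3 8"
      return 0;
    }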
@@ -258,7 +257,6 @@ PARALLEL_FOR_LOOP
       Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
       A(geom.npoint,&CoarseGrid)
     {
       comm_buf.resize(Stencil._unified_buffer_size);
     };

     void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,

@@ -44,7 +44,7 @@ public:
 // Gather for when there is no need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
 template<class vobj,class cobj,class compressor> void
 Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
 Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
 {
   int rd = rhs._grid->_rdimensions[dimension];

@@ -63,7 +63,7 @@ PARALLEL_NESTED_LOOP2
     for(int b=0;b<e2;b++){
       int o  = n*rhs._grid->_slice_stride[dimension];
       int bo = n*rhs._grid->_slice_block[dimension];
       buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
       buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
     }
   }
 } else {
@@ -73,7 +73,7 @@ PARALLEL_NESTED_LOOP2
       int o=n*rhs._grid->_slice_stride[dimension];
       int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
       if ( ocb &cbmask ) {
         buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
         buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
       }
     }
   }
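
[The new off parameter lets successive planes be packed into one unified send buffer at a running offset instead of each gather starting at buffer[0]. A cut-down model of that packing, with an identity lambda in place of a real compressor:]

    #include <cstdio>
    #include <vector>

    // Simplified model of the offset gather; not Grid's full signature.
    template <class T, class compressor>
    void Gather_plane_simple(const std::vector<T> &plane, std::vector<T> &buffer,
                             compressor &compress, int off) {
      for (std::size_t b = 0; b < plane.size(); b++)
        buffer[off + b] = compress(plane[b]);   // was buffer[b] = ...
    }

    int main() {
      auto identity = [](double x) { return x; };
      std::vector<double> p0 = {1, 2}, p1 = {3, 4}, unified(4, 0);
      int u_comm_offset = 0;
      Gather_plane_simple(p0, unified, identity, u_comm_offset); u_comm_offset += p0.size();
      Gather_plane_simple(p1, unified, identity, u_comm_offset); u_comm_offset += p1.size();
      std::printf("%g %g %g %g\n", unified[0], unified[1], unified[2], unified[3]); // 1 2 3 4
      return 0;
    }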

@@ -58,7 +58,6 @@ namespace QCD {
     UmuOdd (&Hgrid)
   {
     // Allocate the required comms buffer
     comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
     ImportGauge(_Umu);
   }

@@ -153,7 +152,7 @@ namespace QCD {
   FermionField Atilde(B._grid);
   Atilde = A;

   st.HaloExchange(B,comm_buf,compressor);
   st.HaloExchange(B,compressor);

   for(int mu=0;mu<Nd;mu++){

@@ -168,7 +167,7 @@ namespace QCD {
   ////////////////////////
PARALLEL_FOR_LOOP
   for(int sss=0;sss<B._grid->oSites();sss++){
     Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma);
     Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
   }

   //////////////////////////////////////////////////
@@ -274,11 +273,11 @@ PARALLEL_FOR_LOOP

   Compressor compressor(dag);

   Stencil.HaloExchange(in,comm_buf,compressor);
   Stencil.HaloExchange(in,compressor);

PARALLEL_FOR_LOOP
   for(int sss=0;sss<in._grid->oSites();sss++){
     Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp,gamma);
     Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
   }

 };
@@ -300,30 +299,30 @@ PARALLEL_FOR_LOOP
   assert((dag==DaggerNo) ||(dag==DaggerYes));

   Compressor compressor(dag);
   st.HaloExchange(in,comm_buf,compressor);
   st.HaloExchange(in,compressor);

   if ( dag == DaggerYes ) {
     if( HandOptDslash ) {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
         Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
       }
     } else {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
         Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
       }
     }
   } else {
     if( HandOptDslash ) {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out);
         Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
       }
     } else {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out);
         Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
       }
     }
   }
@@ -338,8 +337,7 @@ PARALLEL_FOR_LOOP

   Compressor compressor(dag);

   std::thread comms_thread = st.HaloExchangeBegin(in,comm_buf,compressor);
   comms_thread.join();
   auto handle = st.HaloExchangeBegin(in,compressor);

   bool local    = true;
   bool nonlocal = false;
@@ -347,28 +345,29 @@ PARALLEL_FOR_LOOP
     if( HandOptDslash ) {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
         Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
       }
     } else {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
         Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
       }
     }
   } else {
     if( HandOptDslash ) {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
         Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
       }
     } else {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
         Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
       }
     }
   }

   st.HaloExchangeComplete(handle);

   local = false;
   nonlocal = true;
@@ -376,24 +375,24 @@ PARALLEL_FOR_LOOP
     if( HandOptDslash ) {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
         Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
       }
     } else {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
         Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
       }
     }
   } else {
     if( HandOptDslash ) {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
         Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
       }
     } else {
PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
         Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
         Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
       }
     }
   }

@@ -153,9 +153,6 @@ namespace Grid {
   DoubledGaugeField UmuEven;
   DoubledGaugeField UmuOdd;

   // Comms buffer
   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

 };

 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
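
[The hunks above all make the same ownership change: the halo buffer moves from each fermion class into the stencil itself, so kernels receive st.comm_buf instead of a caller-owned vector and the per-class comm_buf members and resize calls disappear. A small sketch of the pattern; names are simplified stand-ins:]

    #include <vector>

    // The stencil now owns the halo buffer, sized once from _unified_buffer_size.
    struct Stencil {
      std::vector<double> comm_buf;
      explicit Stencil(std::size_t unified_buffer_size) : comm_buf(unified_buffer_size) {}
    };

    void kernel(const Stencil &st, std::size_t halo_index, double &out) {
      out = st.comm_buf[halo_index];   // was: caller-passed comm_buf[halo_index]
    }

    int main() {
      Stencil st(64);
      st.comm_buf[3] = 2.5;
      double x = 0;
      kernel(st, 3, x);
      return x == 2.5 ? 0 : 1;
    }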

@@ -98,12 +98,11 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   }

   // Allocate the required comms buffer
   comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO

   ImportGauge(_Umu);
   commtime=0;
   jointime=0;
   dslashtime=0;
   dslash1time=0;
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@@ -121,7 +120,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
   // assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;

   Compressor compressor(DaggerNo);
   Stencil.HaloExchange(in,comm_buf,compressor);
   Stencil.HaloExchange(in,compressor);

   int skip = (disp==1) ? 0 : 1;

@@ -136,7 +135,7 @@ PARALLEL_FOR_LOOP
   for(int s=0;s<Ls;s++){
     int sU=ss;
     int sF = s+Ls*sU;
     Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,gamma);
     Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
   }
 }
};
@@ -159,7 +158,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
   FermionField Btilde(B._grid);
   FermionField Atilde(B._grid);

   st.HaloExchange(B,comm_buf,compressor);
   st.HaloExchange(B,compressor);

   Atilde=A;

@@ -184,7 +183,7 @@ PARALLEL_FOR_LOOP
   assert ( sF< B._grid->oSites());
   assert ( sU< U._grid->oSites());

   Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma);
   Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);

   ////////////////////////////
   // spin trace outer product
@@ -236,9 +235,10 @@ template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
   std::cout<<GridLogMessage << "********************"<<std::endl;
   std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
   std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
   std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
   std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
   std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
   std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
   std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
   std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
   std::cout<<GridLogMessage << "********************"<<std::endl;
   std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
@@ -299,11 +299,11 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
   int nwork = U._grid->oSites();

   commtime -=usecond();
   std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
   auto handle = st.HaloExchangeBegin(in,compressor);
   st.HaloExchangeComplete(handle);
   commtime +=usecond();

   jointime -=usecond();
   thr.join();
   jointime +=usecond();

   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
@@ -319,7 +319,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
     int sU=ss;
     for(int s=0;s<Ls;s++){
       int sF = s+Ls*sU;
       Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
       Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
     }
   }
 } else {
@@ -330,7 +330,7 @@ PARALLEL_FOR_LOOP
   for(sd=0;sd<Ls;sd++){
     int sU=ss;
     int sF = sd+Ls*sU;
     Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
     Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
   }
 }
}
@@ -362,7 +362,7 @@ PARALLEL_FOR_LOOP
     sU = lo.Reorder(sU);
   }
   sF = s+Ls*sU;
   Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
   Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
 }
}
}
@@ -387,7 +387,7 @@ PARALLEL_FOR_LOOP
   sU=ss+ ssoff;
   for(int s=soff;s<soff+swork;s++){
     sF = s+Ls*sU;
     Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
     Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
   }
 }
}
@@ -398,7 +398,7 @@ PARALLEL_FOR_LOOP
   int sU=ss;
   for(int s=0;s<Ls;s++){
     int sF = s+Ls*sU;
     Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
     Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
   }
 }
} else {
@@ -407,7 +407,7 @@ PARALLEL_FOR_LOOP
   int sU=ss;
   for(int s=0;s<Ls;s++){
     int sF = s+Ls*sU;
     Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
     Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
   }
 }
}
@@ -432,7 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
   int nwork = U._grid->oSites();

   commtime -=usecond();
   std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
   auto handle = st.HaloExchangeBegin(in,compressor);
   commtime +=usecond();

   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
@@ -450,7 +450,7 @@ PARALLEL_FOR_LOOP
   int sU=ss;
   for(int s=0;s<Ls;s++){
     int sF = s+Ls*sU;
     Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
     Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
   }
 }
} else {
@@ -461,7 +461,7 @@ PARALLEL_FOR_LOOP
 for(sd=0;sd<Ls;sd++){
   int sU=ss;
   int sF = sd+Ls*sU;
   Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
   Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 }
}
}
@@ -473,7 +473,7 @@ PARALLEL_FOR_LOOP
 int sU=ss;
 for(int s=0;s<Ls;s++){
   int sF = s+Ls*sU;
   Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
   Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 }
}
} else {
@@ -482,7 +482,7 @@ PARALLEL_FOR_LOOP
 int sU=ss;
 for(int s=0;s<Ls;s++){
   int sF = s+Ls*sU;
   Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
   Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 }
}
}
@@ -490,12 +490,12 @@ PARALLEL_FOR_LOOP
 dslashtime +=usecond();

 jointime -=usecond();
 thr.join();
 st.HaloExchangeComplete(handle);
 jointime +=usecond();

 local = false;
 nonlocal = true;
 dslashtime -=usecond();
 dslash1time -=usecond();
 if ( dag == DaggerYes ) {
   if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
@@ -503,7 +503,7 @@ PARALLEL_FOR_LOOP
   int sU=ss;
   for(int s=0;s<Ls;s++){
     int sF = s+Ls*sU;
     Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
     Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
   }
 }
} else {
@@ -514,7 +514,7 @@ PARALLEL_FOR_LOOP
 for(sd=0;sd<Ls;sd++){
   int sU=ss;
   int sF = sd+Ls*sU;
   Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
   Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 }
}
}
@@ -526,7 +526,7 @@ PARALLEL_FOR_LOOP
 int sU=ss;
 for(int s=0;s<Ls;s++){
   int sF = s+Ls*sU;
   Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
   Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 }
}
} else {
@@ -535,13 +535,12 @@ PARALLEL_FOR_LOOP
 int sU=ss;
 for(int s=0;s<Ls;s++){
   int sF = s+Ls*sU;
   Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
   Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
 }
}
}
}
dslashtime +=usecond();

dslash1time +=usecond();

}
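
[DhopInternalCommsOverlapCompute is where the commit title is realised: a first pass applies the operator only where local=true while the comms thread fills the halo, then HaloExchangeComplete joins, and a second pass (local=false, nonlocal=true) finishes the sites that need remote data. A minimal model of the two-pass structure; the needs_halo predicate is invented for illustration:]

    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
      const int sites = 8;
      std::vector<int> halo(sites, 0);
      std::vector<double> out(sites, 0);
      auto needs_halo = [](int s) { return s % 4 == 0; };     // stand-in predicate

      std::thread comms([&] { for (auto &h : halo) h = 1; }); // fill halo "remotely"

      for (int s = 0; s < sites; s++)                 // pass 1: halo-independent sites
        if (!needs_halo(s)) out[s] = 2.0 * s;

      comms.join();                                   // HaloExchangeComplete analogue

      for (int s = 0; s < sites; s++)                 // pass 2: halo-dependent sites
        if (needs_halo(s)) out[s] = 2.0 * s + halo[s];

      std::printf("out[0]=%g out[1]=%g\n", out[0], out[1]); // prints "out[0]=1 out[1]=2"
      return 0;
    }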

@@ -64,6 +64,7 @@ namespace Grid {
   double jointime;
   double commtime;
   double dslashtime;
   double dslash1time;
   ///////////////////////////////////////////////////////////////
   // Implement the abstract base
   ///////////////////////////////////////////////////////////////

@@ -99,9 +99,8 @@ int main (int argc, char ** argv)
     ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
   }

   std::vector<vobj,alignedAllocator<vobj> > comm_buf(myStencil._unified_buffer_size);
   SimpleCompressor<vobj> compress;
   myStencil.HaloExchange(Foo,comm_buf,compress);
   myStencil.HaloExchange(Foo,compress);

   Bar = Cshift(Foo,dir,disp);

@@ -117,7 +116,7 @@ int main (int argc, char ** argv)
     else if (SE->_is_local)
       Check._odata[i] = Foo._odata[SE->_offset];
     else
       Check._odata[i] = comm_buf[SE->_offset];
       Check._odata[i] = myStencil.comm_buf[SE->_offset];
   }

   Real nrmC = norm2(Check);
@@ -181,13 +180,10 @@ int main (int argc, char ** argv)
     ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
   }

   std::vector<vobj,alignedAllocator<vobj> > Ecomm_buf(EStencil._unified_buffer_size);
   std::vector<vobj,alignedAllocator<vobj> > Ocomm_buf(OStencil._unified_buffer_size);

   SimpleCompressor<vobj> compress;

   EStencil.HaloExchange(EFoo,Ecomm_buf,compress);
   OStencil.HaloExchange(OFoo,Ocomm_buf,compress);
   EStencil.HaloExchange(EFoo,compress);
   OStencil.HaloExchange(OFoo,compress);

   Bar = Cshift(Foo,dir,disp);

@@ -211,7 +207,7 @@ int main (int argc, char ** argv)
     else if (SE->_is_local)
       OCheck._odata[i] = EFoo._odata[SE->_offset];
     else
       OCheck._odata[i] = Ecomm_buf[SE->_offset];
       OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
   }
   for(int i=0;i<ECheck._grid->oSites();i++){
     int permute_type;
@@ -224,7 +220,7 @@ int main (int argc, char ** argv)
     else if (SE->_is_local)
       ECheck._odata[i] = OFoo._odata[SE->_offset];
     else
       ECheck._odata[i] = Ocomm_buf[SE->_offset];
       ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
   }

   setCheckerboard(Check,ECheck);