From 65920faebae56dd8e28d43442fe4010c2f491de0 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 31 Aug 2020 18:39:27 +0200
Subject: [PATCH 01/10] correct formatting of Benchmark_wilson_sweep output

---
 benchmarks/Benchmark_wilson_sweep.cc | 1 +
 1 file changed, 1 insertion(+)
diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
index f1473b44..100e09f5 100644
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -89,6 +89,7 @@ int main (int argc, char ** argv)
 
 	  std::cout << GridLogMessage;
 	  std::cout << latt_size;
+    std::cout << "\t\t";
 
 	  GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
 	  GridRedBlackCartesian RBGrid(&Grid);

From a9b92867a817deed5c041a268d97c54d0674e56a Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 31 Aug 2020 18:41:17 +0200
Subject: [PATCH 02/10] use tabulator

---
 benchmarks/Benchmark_wilson_sweep.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
index 100e09f5..986a1831 100644
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -89,7 +89,7 @@ int main (int argc, char ** argv)
 
 	  std::cout << GridLogMessage;
 	  std::cout << latt_size;
-    std::cout << "\t\t";
+	  std::cout << "\t\t";
 
 	  GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
 	  GridRedBlackCartesian RBGrid(&Grid);

From 54523369a3096750dac18f2150a23d6d98709030 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 31 Aug 2020 19:39:36 +0200
Subject: [PATCH 03/10] do not use backspace in Coordinate output stream
 operator

---
 Grid/util/Coordinate.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Grid/util/Coordinate.h b/Grid/util/Coordinate.h
index 7f1d31c0..004fbc72 100644
--- a/Grid/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector<T,_nd
 {
   os << "[";
   for(int s=0;s<v.size();s++) {
-    os << v[s] << " ";
-  }
-  if (v.size() > 0) {
-    os << "\b";
+    os << v[s];
+    if( s < (v.size()-1) ){
+      os << " ";
+    }
   }
   os << "]";
   return os;

From d3ce60713d87dd1baf7d9f7acf1097da1c932d1a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:44:13 -0400
Subject: [PATCH 04/10] UVM, Device and Lattice/aligned allocators

---
 Grid/allocator/AlignedAllocator.h | 57 +++++++++++++++++++++++++++----
 Grid/allocator/MemoryManager.h    |  4 +--
 2 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index ebb3162b..249732fb 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -65,8 +65,7 @@ public:
     MemoryManager::CpuFree((void *)__p,bytes);
   }
 
-  // FIXME: hack for the copy constructor, eventually it must be avoided
-  //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
+  // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
   void construct(pointer __p, const _Tp& __val) { assert(0);};
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
@@ -74,6 +73,9 @@ public:
 template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 
+//////////////////////////////////////////////////////////////////////////////////////
+// Unified virtual memory
+//////////////////////////////////////////////////////////////////////////////////////
 template<typename _Tp>
 class uvmAllocator {
 public: 
@@ -109,22 +111,63 @@ public:
     MemoryManager::SharedFree((void *)__p,bytes);
   }
 
-  // FIXME: hack for the copy constructor, eventually it must be avoided
   void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
-  //void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
 
+////////////////////////////////////////////////////////////////////////////////
+// Device memory
+////////////////////////////////////////////////////////////////////////////////
+template<typename _Tp>
+class devAllocator {
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef devAllocator<_Tp1> other; };
+  devAllocator() throw() { }
+  devAllocator(const devAllocator&) throw() { }
+  template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
+  ~devAllocator() throw() { }
+  pointer       address(reference __x)       const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+
+  pointer allocate(size_type __n, const void* _p= 0)
+  { 
+    size_type bytes = __n*sizeof(_Tp);
+    profilerAllocate(bytes);
+    _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    return ptr;
+  }
+
+  void deallocate(pointer __p, size_type __n) 
+  { 
+    size_type bytes = __n * sizeof(_Tp);
+    profilerFree(bytes);
+    MemoryManager::AcceleratorFree((void *)__p,bytes);
+  }
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+template<typename _Tp>  inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
+
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-template<class T> using commAllocator = uvmAllocator<T>;
+//template<class T> using commAllocator = devAllocator<T>;
 template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
-template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
-//template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
+template<class T> using commVector = std::vector<T,devAllocator<T> >;
 
 NAMESPACE_END(Grid);
 
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 23065c58..aac13aee 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -93,12 +93,12 @@ private:
   static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
   static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
 
-  static void *AcceleratorAllocate(size_t bytes);
-  static void  AcceleratorFree    (void *ptr,size_t bytes);
   static void PrintBytes(void);
  public:
   static void Init(void);
   static void InitMessage(void);
+  static void *AcceleratorAllocate(size_t bytes);
+  static void  AcceleratorFree    (void *ptr,size_t bytes);
   static void *SharedAllocate(size_t bytes);
   static void  SharedFree    (void *ptr,size_t bytes);
   static void *CpuAllocate(size_t bytes);

From 0c3095e173bc2886b121edb3fafa78646948f5e8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:45:35 -0400
Subject: [PATCH 05/10] Comms buffers to device memory

---
 Grid/communicator/Communicator_mpi3.cc | 88 +++++++++-----------------
 1 file changed, 31 insertions(+), 57 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 6130195d..28c7b8a4 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -302,60 +302,35 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int bytes)
 {
   std::vector<CommsRequest_t> reqs(0);
-  //    unsigned long  xcrc = crc32(0L, Z_NULL, 0);
-  //    unsigned long  rcrc = crc32(0L, Z_NULL, 0);
-  //    xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
-  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromComplete(reqs);
-  //    rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
-  //    printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
-}
-void CartesianCommunicator::SendRecvPacket(void *xmit,
-					   void *recv,
-					   int sender,
-					   int receiver,
-					   int bytes)
-{
-  MPI_Status stat;
-  assert(sender != receiver);
-  int tag = sender;
-  if ( _processor == sender ) {
-    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
-  }
-  if ( _processor == receiver ) {
-    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
-  }
-}
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
+  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
+  unsigned long  rcrc = crc32(0L, Z_NULL, 0);
+
   int myrank = _processor;
   int ierr;
 
-  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
-    MPI_Request xrq;
-    MPI_Request rrq;
+  // Enforce no UVM in comms, device or host OK
+  int uvm;
+  auto 
+  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit);
+  assert(cuerr == cudaSuccess );
+  assert(uvm==0);
 
-    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv);
+  assert(cuerr == cudaSuccess );
+  assert(uvm==0);
 
-    assert(ierr==0);
-    list.push_back(xrq);
-    list.push_back(rrq);
-  } else {
-    // Give the CPU to MPI immediately; can use threads to overlap optionally
-    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-		      recv,bytes,MPI_CHAR,from, from,
-		      communicator,MPI_STATUS_IGNORE);
-    assert(ierr==0);
-  }
+  // Give the CPU to MPI immediately; can use threads to overlap optionally
+  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
+  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
+		    recv,bytes,MPI_CHAR,from, from,
+		    communicator,MPI_STATUS_IGNORE);
+  assert(ierr==0);
+
+  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
+  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
+  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
-
+// Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int dest,
 						     void *recv,
@@ -411,15 +386,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 
   return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
-{
-  SendToRecvFromComplete(waitall);
-}
-void CartesianCommunicator::StencilBarrier(void)
-{
-  MPI_Barrier  (ShmComm);
-}
-void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   int nreq=list.size();
 
@@ -430,6 +397,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
   assert(ierr==0);
   list.resize(0);
 }
+void CartesianCommunicator::StencilBarrier(void)
+{
+  MPI_Barrier  (ShmComm);
+}
+//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+//{
+//}
 void CartesianCommunicator::Barrier(void)
 {
   int ierr = MPI_Barrier(communicator);

From b4255140d611f3cbb1ccf6148450f14f5e0539cb Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:47:46 -0400
Subject: [PATCH 06/10] Stale data member eliminated

---
 Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h | 2 +-
 Grid/qcd/action/fermion/WilsonFermion5D.h            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index 625eda63..ca660610 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -208,7 +208,7 @@ public:
   LebesgueOrder LebesgueEvenOdd;
     
   // Comms buffer
-  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
     
   ///////////////////////////////////////////////////////////////
   // Conserved current utilities
diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h
index 804b1d10..80231bb4 100644
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -215,7 +215,7 @@ public:
   LebesgueOrder LebesgueEvenOdd;
     
   // Comms buffer
-  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 
 
 };

From 85b1c5df39c33e3de9d3ab2917400b9c985428c8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:48:16 -0400
Subject: [PATCH 07/10] A never hit case that is not 100% confident is asserted
 for safety

---
 Grid/cshift/Cshift_common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
index d296c024..40ab5032 100644
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -222,6 +222,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
     // Test_cshift_red_black code.
     //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
+    assert(0); // This will fail if hit on GPU
     autoView( rhs_v, rhs, CpuWrite);
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){

From bcd7895362e6c1d1084da44837a8aa353d0ac936 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 15:49:13 -0400
Subject: [PATCH 08/10] Include cuda.h

---
 Grid/threads/Accelerator.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 74a3ea22..29d12904 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -90,6 +90,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////
 
 #ifdef GRID_CUDA
+#include <cuda.h>
 
 #ifdef __CUDA_ARCH__
 #define GRID_SIMT

From 8244caff2507eab0a99a59f275bd1745efa7336d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 18:52:55 -0400
Subject: [PATCH 09/10] Remove the asynchronous non-Stencil calls.

---
 benchmarks/Benchmark_comms.cc | 108 ++++------------------------------
 1 file changed, 10 insertions(+), 98 deletions(-)

diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 44ccbd19..232030c8 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -74,90 +74,6 @@ int main (int argc, char ** argv)
   std::vector<double> t_time(Nloop);
   time_statistics timestat;
 
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
-  for(int lat=8;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=8;Ls*=2){
-
-      Coordinate latt_size  ({lat*mpi_layout[0],
-	                      lat*mpi_layout[1],
-      			      lat*mpi_layout[2],
-      			      lat*mpi_layout[3]});
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;
-
-      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
-      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
-
-      int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-      for(int mu=0;mu<8;mu++){
-	xbuf[mu].resize(lat*lat*lat*Ls);
-	rbuf[mu].resize(lat*lat*lat*Ls);
-	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
-      }
-
-      for(int i=0;i<Nloop;i++){
-      double start=usecond();
-
-	std::vector<CommsRequest_t> requests;
-
-	ncomm=0;
-	for(int mu=0;mu<4;mu++){
-	
-	  if (mpi_layout[mu]>1 ) {
-	  
-	    ncomm++;
-	    int comm_proc=1;
-	    int xmit_to_rank;
-	    int recv_from_rank;
-	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.SendToRecvFromBegin(requests,
-				   (void *)&xbuf[mu][0],
-				   xmit_to_rank,
-				   (void *)&rbuf[mu][0],
-				   recv_from_rank,
-				   bytes);
-	
-	    comm_proc = mpi_layout[mu]-1;
-	  
-	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    Grid.SendToRecvFromBegin(requests,
-				     (void *)&xbuf[mu+4][0],
-				     xmit_to_rank,
-				     (void *)&rbuf[mu+4][0],
-				     recv_from_rank,
-				     bytes);
-	  
-	  }
-	}
-	Grid.SendToRecvFromComplete(requests);
-	Grid.Barrier();
-	double stop=usecond();
-	t_time[i] = stop-start; // microseconds
-      }
-
-      timestat.statistics(t_time);
-
-      double dbytes    = bytes*ppn;
-      double xbytes    = dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
-
-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
-               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
-    }
-  }    
 
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -206,26 +122,22 @@ int main (int argc, char ** argv)
 	    {
 	      std::vector<CommsRequest_t> requests;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFromBegin(requests,
-				       (void *)&xbuf[mu][0],
-				       xmit_to_rank,
-				       (void *)&rbuf[mu][0],
-				       recv_from_rank,
-				       bytes);
-	      Grid.SendToRecvFromComplete(requests);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu][0],
+				  recv_from_rank,
+				  bytes);
 	    }
 
 	    comm_proc = mpi_layout[mu]-1;
 	    {
 	      std::vector<CommsRequest_t> requests;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	      Grid.SendToRecvFromBegin(requests,
-				       (void *)&xbuf[mu+4][0],
-				       xmit_to_rank,
-				       (void *)&rbuf[mu+4][0],
-				       recv_from_rank,
-				       bytes);
-	      Grid.SendToRecvFromComplete(requests);
+	      Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu+4][0],
+				  recv_from_rank,
+				  bytes);
 	    }
 	  }
 	}

From a8309638d4be9faf49619a8220a9b06a4e16eb73 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 3 Sep 2020 20:29:26 -0400
Subject: [PATCH 10/10] UVM check in MPI calls

---
 Grid/communicator/Communicator_mpi3.cc | 11 ++---------
 Grid/threads/Accelerator.h             | 22 ++++++++++++++++++++++
 configure.ac                           |  5 ++---
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 28c7b8a4..83f71233 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -309,15 +309,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
   int ierr;
 
   // Enforce no UVM in comms, device or host OK
-  int uvm;
-  auto 
-  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit);
-  assert(cuerr == cudaSuccess );
-  assert(uvm==0);
-
-  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv);
-  assert(cuerr == cudaSuccess );
-  assert(uvm==0);
+  assert(acceleratorIsCommunicable(xmit));
+  assert(acceleratorIsCommunicable(recv));
 
   // Give the CPU to MPI immediately; can use threads to overlap optionally
   //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 29d12904..1a3dfdc2 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
 //
 // Memory management:
 //
+//    int   acceleratorIsCommunicable(void *pointer);
 //    void *acceleratorAllocShared(size_t bytes);
 //    void acceleratorFreeShared(void *ptr);
 //
@@ -166,6 +167,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline int  acceleratorIsCommunicable(void *ptr)
+{
+  int uvm;
+  auto 
+  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
+  assert(cuerr == cudaSuccess );
+  if(uvm) return 0;
+  else    return 1;
+}
+
 #endif
 
 //////////////////////////////////////////////
@@ -220,6 +231,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
+inline int  acceleratorIsCommunicable(void *ptr)
+{
+#if 0
+  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
+  if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
+  else return 0;
+#endif
+  return 1;
+}
 
 #endif
 
@@ -299,6 +319,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   return malloc(bytes);
 #endif
 };
+inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 
 inline void *acceleratorAllocDevice(size_t bytes)
 {
@@ -353,6 +374,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
 
+inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 #ifdef HAVE_MM_MALLOC_H
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
diff --git a/configure.ac b/configure.ac
index f0bea6a4..5113757c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -154,6 +154,7 @@ AC_ARG_ENABLE([accelerator],
 case ${ac_ACCELERATOR} in
     cuda)
       echo CUDA acceleration
+      LIBS="${LIBS} -lcuda"
       AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
     sycl)
       echo SYCL acceleration
@@ -323,7 +324,6 @@ case ${CXXTEST} in
 #    CXXLD="nvcc -v -link"
     CXX="${CXXBASE} -x cu "
     CXXLD="${CXXBASE} -link"
-#    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
     CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
     if test $ac_openmp = yes; then
        CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
@@ -483,8 +483,7 @@ case ${ac_SHM} in
      LDFLAGS_CPY=$LDFLAGS
      CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
      LDFLAGS="$AM_LDFLAGS $LDFLAGS"
-     AC_SEARCH_LIBS([shm_unlink], [rt], [],
-                    [AC_MSG_ERROR("no library found for shm_unlink")])
+     AC_SEARCH_LIBS([shm_unlink], [rt], [],[AC_MSG_ERROR("no library found for shm_unlink")])
      CXXFLAGS=$CXXFLAGS_CPY
      LDFLAGS=$LDFLAGS_CPY
      ;;