mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Merge branch 'develop' of https://github.com/paboyle/Grid into develop

Peter Boyle 2020-09-03 20:30:49 -04:00
commit 6dbd117aa5
11 changed files with 119 additions and 173 deletions

View File

@@ -65,8 +65,7 @@ public:
MemoryManager::CpuFree((void *)__p,bytes);
}
// FIXME: hack for the copy constructor, eventually it must be avoided
//void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
// FIXME: hack for the copy constructor: it must be avoided to prevent a single-threaded copy loop
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p) { };
void destroy(pointer __p) { };
@@ -74,6 +73,9 @@ public:
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////
// Unified virtual memory
//////////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class uvmAllocator {
public:
@@ -109,22 +111,63 @@ public:
MemoryManager::SharedFree((void *)__p,bytes);
}
// FIXME: hack for the copy constructor, eventually it must be avoided
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Device memory
////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class devAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
devAllocator() throw() { }
devAllocator(const devAllocator&) throw() { }
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
~devAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::AcceleratorFree((void *)__p,bytes);
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = uvmAllocator<T>;
//template<class T> using commAllocator = devAllocator<T>;
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
//template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
NAMESPACE_END(Grid);
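
The typedef block above decides which memory each container lives in: Vector<T> keeps the unified-memory (UVM) allocator, while commVector<T> now uses devAllocator<T> and therefore holds device-only memory that must be filled by explicit copies. A minimal usage sketch, assuming the Grid headers and namespace (the function name and sizes are illustrative):

#include <Grid/Grid.h>

using namespace Grid;

void buffer_example(size_t n)
{
  Vector<double>     work(n, 1.0);  // uvmAllocator: visible to host and device
  commVector<double> sendbuf(n);    // devAllocator: device-resident, suitable for MPI/RDMA

  // Device-only memory cannot be written from host code; stage it explicitly.
  acceleratorCopyToDevice(&work[0], &sendbuf[0], n*sizeof(double));
}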

View File

@@ -93,12 +93,12 @@ private:
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void PrintBytes(void);
public:
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void *SharedAllocate(size_t bytes);
static void SharedFree (void *ptr,size_t bytes);
static void *CpuAllocate(size_t bytes);
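
The hunk above moves AcceleratorAllocate and AcceleratorFree from the private section into the public interface, which is what lets devAllocator in the first file call them directly. A short sketch of direct use, assuming the Grid headers (scratch_example is an illustrative name):

#include <Grid/Grid.h>
#include <cassert>

using namespace Grid;

void scratch_example(size_t n)
{
  size_t bytes = n*sizeof(double);
  double *buf = (double *) MemoryManager::AcceleratorAllocate(bytes); // device pool allocation
  assert(buf != NULL);
  // ... hand buf to kernels or MPI, then return it to the pool ...
  MemoryManager::AcceleratorFree((void *)buf, bytes);
}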

View File

@@ -302,60 +302,28 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
// Enforce no UVM in comms, device or host OK
assert(acceleratorIsCommunicable(xmit));
assert(acceleratorIsCommunicable(recv));
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
// Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest,
void *recv,
@@ -411,15 +379,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
@@ -430,6 +390,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(ierr==0);
list.resize(0);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
//{
//}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
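
The communicator changes above fold SendToRecvFromComplete into StencilSendToRecvFromComplete and keep two policies in SendToRecvFromBegin: non-blocking MPI_Irecv/MPI_Isend pairs collected into a request list, or a single blocking MPI_Sendrecv. A standalone MPI sketch of the same pattern, not Grid's exact code (halo_exchange and the concurrent flag are illustrative):

#include <mpi.h>
#include <vector>
#include <cassert>

void halo_exchange(void *xmit, void *recv, int dest, int from, int bytes,
                   MPI_Comm comm, bool concurrent)
{
  int me; MPI_Comm_rank(comm, &me);
  if (concurrent) {
    std::vector<MPI_Request> reqs(2);
    int ierr  = MPI_Irecv(recv, bytes, MPI_CHAR, from, from, comm, &reqs[0]);
    ierr     |= MPI_Isend(xmit, bytes, MPI_CHAR, dest, me,   comm, &reqs[1]);
    assert(ierr == 0);
    // ... overlap computation here, then complete, as StencilSendToRecvFromComplete does ...
    std::vector<MPI_Status> status(2);
    ierr = MPI_Waitall(2, &reqs[0], &status[0]);
    assert(ierr == 0);
  } else {
    int ierr = MPI_Sendrecv(xmit, bytes, MPI_CHAR, dest, me,
                            recv, bytes, MPI_CHAR, from, from,
                            comm, MPI_STATUS_IGNORE);
    assert(ierr == 0);
  }
}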

View File

@@ -222,6 +222,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
// Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
assert(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){

View File

@@ -208,7 +208,7 @@ public:
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
///////////////////////////////////////////////////////////////
// Conserved current utilities

View File

@@ -215,7 +215,7 @@ public:
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
};

View File

@@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
//
// Memory management:
//
// int acceleratorIsCommunicable(void *pointer);
// void *acceleratorAllocShared(size_t bytes);
// void acceleratorFreeShared(void *ptr);
//
@@ -90,6 +91,7 @@ void acceleratorInit(void);
//////////////////////////////////////////////
#ifdef GRID_CUDA
#include <cuda.h>
#ifdef __CUDA_ARCH__
#define GRID_SIMT
@@ -165,6 +167,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline int acceleratorIsCommunicable(void *ptr)
{
int uvm;
auto cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
assert(cuerr == CUDA_SUCCESS );
if(uvm) return 0;
else return 1;
}
#endif
//////////////////////////////////////////////
@@ -219,6 +231,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
{
#if 0
auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
if ( uvm == cl::sycl::usm::alloc::shared ) return 1;
else return 0;
#endif
return 1;
}
#endif
@@ -298,6 +319,7 @@ inline void *acceleratorAllocShared(size_t bytes)
return malloc(bytes);
#endif
};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void *acceleratorAllocDevice(size_t bytes)
{
@@ -352,6 +374,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
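
acceleratorIsCommunicable gives the comms layer a way to reject unified (managed) memory: the CUDA branch above asks the driver API whether the pointer is managed (presumably why configure.ac later in this commit adds -lcuda), the SYCL branch is stubbed to return 1, and the host fallbacks always return 1. A standalone sketch of the same classification, assuming a CUDA toolkit (isCommunicable and main are illustrative):

#include <cuda.h>
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>

int isCommunicable(void *ptr)
{
  int managed = 0;
  CUresult err = cuPointerGetAttribute(&managed, CU_POINTER_ATTRIBUTE_IS_MANAGED,
                                       (CUdeviceptr) ptr);
  assert(err == CUDA_SUCCESS);
  return managed ? 0 : 1;   // managed (UVM) pointers are not handed to MPI
}

int main(void)
{
  void *dev = NULL, *uvm = NULL;
  cudaMalloc(&dev, 1024);         // plain device memory -> communicable
  cudaMallocManaged(&uvm, 1024);  // managed memory      -> not communicable
  printf("device %d managed %d\n", isCommunicable(dev), isCommunicable(uvm));
  cudaFree(dev); cudaFree(uvm);
  return 0;
}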

View File

@@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector<T,_nd
{
os << "[";
for(int s=0;s<v.size();s++) {
os << v[s] << " ";
}
if (v.size() > 0) {
os << "\b";
os << v[s];
if( s < (v.size()-1) ){
os << " ";
}
}
os << "]";
return os;
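
The operator<< change replaces the old trailing-space-plus-backspace trick with a separator printed only between elements, which stays correct when the stream is a file or pipe rather than a terminal. A minimal sketch of the same pattern for a plain std::vector (print_bracketed is an illustrative name):

#include <iostream>
#include <vector>

template<class T>
std::ostream & print_bracketed(std::ostream &os, const std::vector<T> &v)
{
  os << "[";
  for (std::size_t s = 0; s < v.size(); s++) {
    os << v[s];
    if (s + 1 < v.size()) os << " ";  // no separator after the last element
  }
  os << "]";
  return os;
}

// print_bracketed(std::cout, std::vector<int>{1,2,3}) << std::endl;  // prints [1 2 3]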

View File

@@ -74,90 +74,6 @@ int main (int argc, char ** argv)
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
}
for(int i=0;i<Nloop;i++){
double start=usecond();
std::vector<CommsRequest_t> requests;
ncomm=0;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
Grid.SendToRecvFromComplete(requests);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
double dbytes = bytes*ppn;
double xbytes = dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -206,26 +122,22 @@ int main (int argc, char ** argv)
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
Grid.SendToRecvFromComplete(requests);
Grid.SendToRecvFrom((void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
}
comm_proc = mpi_layout[mu]-1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
Grid.SendToRecvFromComplete(requests);
Grid.SendToRecvFrom((void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
}
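
The deleted concurrent-mode loop above accumulated its bandwidth figures from the payload size, ranks per node, and number of communicated dimensions. A small worked sketch of that arithmetic with illustrative inputs (bytes per microsecond equals MB/s):

#include <cstdio>

int main(void)
{
  // Hypothetical inputs: per-rank payload per direction, ranks per node,
  // number of communicated dimensions, and mean loop time in microseconds.
  double bytes = 1.0e6, ppn = 4.0, ncomm = 4.0, mean_usec = 250.0;

  double dbytes    = bytes * ppn;           // traffic per node, one direction
  double xbytes    = dbytes * 2.0 * ncomm;  // both senses of every communicated dimension
  double rbytes    = xbytes;                // matching receive traffic
  double bidibytes = xbytes + rbytes;       // bidirectional total

  printf("xmit %.1f MB/s  bidirectional %.1f MB/s\n",
         xbytes/mean_usec, bidibytes/mean_usec);
  return 0;
}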

View File

@@ -89,6 +89,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage;
std::cout << latt_size;
std::cout << "\t\t";
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);

View File

@@ -154,6 +154,7 @@ AC_ARG_ENABLE([accelerator],
case ${ac_ACCELERATOR} in
cuda)
echo CUDA acceleration
LIBS="${LIBS} -lcuda"
AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
sycl)
echo SYCL acceleration
@@ -323,7 +324,6 @@ case ${CXXTEST} in
# CXXLD="nvcc -v -link"
CXX="${CXXBASE} -x cu "
CXXLD="${CXXBASE} -link"
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
if test $ac_openmp = yes; then
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
@@ -483,8 +483,7 @@ case ${ac_SHM} in
LDFLAGS_CPY=$LDFLAGS
CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
LDFLAGS="$AM_LDFLAGS $LDFLAGS"
AC_SEARCH_LIBS([shm_unlink], [rt], [],
[AC_MSG_ERROR("no library found for shm_unlink")])
AC_SEARCH_LIBS([shm_unlink], [rt], [],[AC_MSG_ERROR("no library found for shm_unlink")])
CXXFLAGS=$CXXFLAGS_CPY
LDFLAGS=$LDFLAGS_CPY
;;