UVM check in MPI calls

2025-08-03 13:17:06 +01:00 · 2020-09-03 20:29:26 -04:00
parent 8244caff25
commit a8309638d4
3 changed files with 26 additions and 12 deletions
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -309,15 +309,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
  int ierr;

  // Enforce no UVM in comms, device or host OK
-  int uvm;
-  auto 
-  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit);
-  assert(cuerr == cudaSuccess );
-  assert(uvm==0);
-
-  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv);
-  assert(cuerr == cudaSuccess );
-  assert(uvm==0);
+  assert(acceleratorIsCommunicable(xmit));
+  assert(acceleratorIsCommunicable(recv));

  // Give the CPU to MPI immediately; can use threads to overlap optionally
  //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
 //
 // Memory management:
 //
+//    int   acceleratorIsCommunicable(void *pointer);
 //    void *acceleratorAllocShared(size_t bytes);
 //    void acceleratorFreeShared(void *ptr);
 //
@@ -166,6 +167,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline int  acceleratorIsCommunicable(void *ptr)
+{
+  // Returns 0 when the CUDA driver reports ptr as managed (UVM) memory, 1 otherwise.
+  int uvm = 0; // defined even if the query fails and the assert is compiled out (NDEBUG)
+  CUresult cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
+  assert(cuerr == CUDA_SUCCESS ); // driver-API status: compare with CUDA_SUCCESS, not the runtime's cudaSuccess
+  (void)cuerr;                    // avoid an unused-variable warning when asserts are disabled
+  return uvm ? 0 : 1;
+}
+
 #endif

 //////////////////////////////////////////////
@@ -220,6 +231,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
+inline int  acceleratorIsCommunicable(void *ptr)
+{
+#if 0
+  // Disabled USM query: shared (managed) memory is the non-communicable case, as in the CUDA version.
+  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
+  if ( uvm == cl::sycl::usm::alloc::shared ) return 0; // '==' (was '='); shared => not communicable
+#endif
+  return 1; // with the query disabled, every pointer is reported as communicable
+}

 #endif

@@ -299,6 +319,7 @@ inline void *acceleratorAllocShared(size_t bytes)
  return malloc(bytes);
 #endif
 };
+inline int  acceleratorIsCommunicable(void *ptr){ return 1; } // unconditionally reports ptr as communicable; ptr is not inspected

 inline void *acceleratorAllocDevice(size_t bytes)
 {
@@ -353,6 +374,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}

+inline int  acceleratorIsCommunicable(void *ptr){ return 1; } // unconditionally reports ptr as communicable; ptr is not inspected
 #ifdef HAVE_MM_MALLOC_H
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
--- a/configure.ac
+++ b/configure.ac
@@ -154,6 +154,7 @@ AC_ARG_ENABLE([accelerator],
 case ${ac_ACCELERATOR} in
    cuda)
      echo CUDA acceleration
+      LIBS="${LIBS} -lcuda"
      AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
    sycl)
      echo SYCL acceleration
@@ -323,7 +324,6 @@ case ${CXXTEST} in
 #    CXXLD="nvcc -v -link"
    CXX="${CXXBASE} -x cu "
    CXXLD="${CXXBASE} -link"
-#    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
    if test $ac_openmp = yes; then
       CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
@@ -483,8 +483,7 @@ case ${ac_SHM} in
     LDFLAGS_CPY=$LDFLAGS
     CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
     LDFLAGS="$AM_LDFLAGS $LDFLAGS"
-     AC_SEARCH_LIBS([shm_unlink], [rt], [],
-                    [AC_MSG_ERROR("no library found for shm_unlink")])
+     AC_SEARCH_LIBS([shm_unlink], [rt], [],[AC_MSG_ERROR("no library found for shm_unlink")])
     CXXFLAGS=$CXXFLAGS_CPY
     LDFLAGS=$LDFLAGS_CPY
     ;;