Replace cuda/hip memcpy with Grid functions

2025-12-20 04:34:29 +00:00 · 2024-02-26 09:55:07 -05:00
parent 0a816b5509
commit 6cd2d8fcd5
2 changed files with 18 additions and 44 deletions
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@@ -4,11 +4,6 @@

 #include <cub/cub.cuh>
 #define gpucub cub
-#define gpuMalloc cudaMalloc
-#define gpuFree cudaFree
-#define gpuMemcpyAsync cudaMemcpyAsync
-#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
-#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuError_t cudaError_t
 #define gpuSuccess cudaSuccess

@@ -16,11 +11,6 @@

 #include <hipcub/hipcub.hpp>
 #define gpucub hipcub
-#define gpuMalloc hipMalloc
-#define gpuFree hipFree
-#define gpuMemcpyAsync hipMemcpyAsync
-#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuError_t hipError_t
 #define gpuSuccess hipSuccess

@@ -51,38 +41,22 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
  }
  
  //Allocate memory for output and offset arrays on device
-  gpuError_t gpuErr = gpuMalloc(&d_out,rd*sizeof(vobj));
-  if (gpuErr != gpuSuccess) {
-    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_out)! Error: " << gpuErr <<std::endl;
-    exit(EXIT_FAILURE);
-  }
-
-  gpuErr = gpuMalloc(&d_offsets,sizeof(int)*(rd+1));
-  if (gpuErr != gpuSuccess) {
-    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_offsets)! Error: " << gpuErr <<std::endl;
-    exit(EXIT_FAILURE);
-  }
-
-  //copy offsets to device
-  gpuErr = gpuMemcpyAsync(d_offsets,&offsets[0],sizeof(int)*(rd+1),gpuMemcpyHostToDevice,computeStream);
-  if (gpuErr != gpuSuccess) {
-    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMemcpy (d_offsets)! Error: " << gpuErr <<std::endl;
-    exit(EXIT_FAILURE);
-  }
-
+  d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
  
-  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
+  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
+  
+  //copy offsets to device
+  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+  
+  
+  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }

  //allocate memory for temp_storage_array  
-  gpuErr = gpuMalloc(&temp_storage_array,temp_storage_bytes);
-  if (gpuErr!=gpuSuccess) {
-    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (temp_storage_array)! Error: " << gpuErr <<std::endl;
-    exit(EXIT_FAILURE);
-  }
+  temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
  
  //prepare buffer for reduction
  //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
@@ -105,18 +79,14 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
    exit(EXIT_FAILURE);
  }
  
-  gpuErr = gpuMemcpyAsync(&lvSum[0],d_out,rd*sizeof(vobj),gpuMemcpyDeviceToHost,computeStream);
-  if (gpuErr!=gpuSuccess) {
-    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMemcpy (d_out)! Error: " << gpuErr <<std::endl;
-    exit(EXIT_FAILURE);
-  }
-
+  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+  
  //sync after copy
  accelerator_barrier();
 
-  gpuFree(temp_storage_array);
-  gpuFree(d_out);
-  gpuFree(d_offsets);
+  acceleratorFreeDevice(temp_storage_array);
+  acceleratorFreeDevice(d_out);
+  acceleratorFreeDevice(d_offsets);
  

 }
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -225,6 +225,8 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
@@ -442,6 +444,8 @@ inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
+inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) {  }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}