From 6ceb55668420e843b984ed244953cdbc5612dfea Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Nov 2021 20:45:12 -0500 Subject: [PATCH] Intranode asynch hipMemCopy --- Grid/threads/Accelerator.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index cec0600f..0dfe8c64 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -230,6 +230,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream); } inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); }; + inline int acceleratorIsCommunicable(void *ptr) { // int uvm=0; @@ -337,6 +338,7 @@ NAMESPACE_BEGIN(Grid); #define accelerator __host__ __device__ #define accelerator_inline __host__ __device__ inline +extern hipStream_t copyStream; /*These routines define mapping from thread grid to loop & vector lane indexing */ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -411,10 +413,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} -inline void acceleratorCopySynchronise(void) { } +//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +//inline void acceleratorCopySynchronise(void) { } inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +{ + hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream); +} +inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }; + #endif ////////////////////////////////////////////// @@ -485,18 +493,12 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);}; /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// -accelerator_inline void acceleratorSynchronise(void) +accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs { #ifdef GRID_SIMT #ifdef GRID_CUDA __syncwarp(); #endif -#ifdef GRID_SYCL - //cl::sycl::detail::workGroupBarrier(); -#endif -#ifdef GRID_HIP - __syncthreads(); -#endif #endif return; }