diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 9c40f538..32a6693e 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -8,6 +8,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; #ifdef GRID_CUDA cudaDeviceProp *gpu_props; +cudaStream_t copyStream; void acceleratorInit(void) { int nDevices = 1; diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index a8c91aa8..fa4d1dc0 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -105,6 +105,7 @@ void acceleratorInit(void); #define accelerator_inline __host__ __device__ inline extern int acceleratorAbortOnGpuError; +extern cudaStream_t copyStream; accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -213,9 +214,13 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +{ + cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream); +} +inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); }; inline int acceleratorIsCommunicable(void *ptr) { // int uvm=0; @@ -289,7 +294,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void 
acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { + theGridAccelerator->memcpy(to,from,bytes); +} +inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); } inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} @@ -394,7 +402,8 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorCopySynchronise(void) { } inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} #endif @@ -435,7 +444,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific inline void acceleratorCopyToDevice(void *from,void *to,size_t 
bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} +inline void acceleratorCopySynchronise(void) {}; inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}