mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Introduce a non-default stream for compute operations

Peter Boyle 2022-07-01 09:42:53 -04:00
parent 57b442d0de
commit bd99fd608c
2 changed files with 14 additions and 8 deletions
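
For context, this is the standard CUDA idiom the commit adopts: create a dedicated, non-default stream at initialisation, submit compute kernels to it, and synchronise only that stream. Below is a minimal, self-contained sketch of the idiom; the kernel name, buffer, and launch geometry are illustrative assumptions and are not taken from Grid.

// Minimal sketch (not Grid's code): run a compute kernel on a non-default stream.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scaleKernel(float *x, int n)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if ( i < n ) x[i] *= 2.0f;
}

int main(void)
{
  const int n = 1<<20;
  float *x;
  cudaMalloc(&x, n*sizeof(float));

  cudaStream_t computeStream;                    // non-default stream for compute
  cudaStreamCreate(&computeStream);

  // The fourth launch-configuration argument selects the stream.
  scaleKernel<<<(n+255)/256, 256, 0, computeStream>>>(x, n);

  // Wait only for work queued on this stream, not for the whole device.
  cudaStreamSynchronize(computeStream);
  cudaError_t err = cudaGetLastError();
  if ( err != cudaSuccess ) printf("CUDA error %s \n", cudaGetErrorString(err));

  cudaStreamDestroy(computeStream);
  cudaFree(x);
  return 0;
}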

View File

@@ -16,6 +16,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
 #ifdef GRID_CUDA
 cudaDeviceProp *gpu_props;
 cudaStream_t copyStream;
+cudaStream_t cpuStream;
 void acceleratorInit(void)
 {
 int nDevices = 1;
@@ -98,6 +99,7 @@ void acceleratorInit(void)
 cudaSetDevice(device);
 cudaStreamCreate(&copyStream);
+cudaStreamCreate(&cpuStream);
 const int len=64;
 char busid[len];
 if( rank == world_rank ) {
@@ -112,6 +114,7 @@ void acceleratorInit(void)
 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
 hipStream_t copyStream;
+hipStream_t cpuStream;
 void acceleratorInit(void)
 {
 int nDevices = 1;
@@ -180,6 +183,7 @@ void acceleratorInit(void)
 #endif
 hipSetDevice(device);
 hipStreamCreate(&copyStream);
+hipStreamCreate(&cpuStream);
 const int len=64;
 char busid[len];
 if( rank == world_rank ) {
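
Both backends now hold two non-default streams created once in acceleratorInit(): copyStream, which Grid uses for asynchronous copies, and the new cpuStream for compute kernels. Keeping them separate lets a transfer proceed concurrently with kernel execution and be waited on independently. A hedged sketch of that overlap follows; the kernel and buffers are hypothetical and not Grid code.

// Sketch only: two independent streams, compute overlapping an async copy.
#include <cuda_runtime.h>

__global__ void busyKernel(float *a, int n)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if ( i < n ) a[i] = a[i]*a[i] + 1.0f;
}

int main(void)
{
  const int n = 1<<20;
  float *d_a, *d_b, *h_b;
  cudaMalloc(&d_a, n*sizeof(float));
  cudaMalloc(&d_b, n*sizeof(float));
  cudaMallocHost(&h_b, n*sizeof(float));   // pinned host memory so the copy can overlap

  cudaStream_t computeStream, copyStream;
  cudaStreamCreate(&computeStream);
  cudaStreamCreate(&copyStream);

  // The kernel on one stream and the copy on the other may run concurrently.
  busyKernel<<<(n+255)/256, 256, 0, computeStream>>>(d_a, n);
  cudaMemcpyAsync(h_b, d_b, n*sizeof(float), cudaMemcpyDeviceToHost, copyStream);

  cudaStreamSynchronize(computeStream);    // barrier for compute only
  cudaStreamSynchronize(copyStream);       // wait for the transfer separately

  cudaFreeHost(h_b); cudaFree(d_a); cudaFree(d_b);
  return 0;
}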

View File

@@ -107,6 +107,7 @@ void acceleratorInit(void);
 extern int acceleratorAbortOnGpuError;
 extern cudaStream_t copyStream;
+extern cudaStream_t cpuStream;
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
@@ -134,7 +135,7 @@ inline void cuda_mem(void)
 }; \
 dim3 cu_threads(nsimd,acceleratorThreads(),1); \
 dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
-LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
+LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \
 }
 #define accelerator_for6dNB(iter1, num1, \
@@ -153,7 +154,7 @@ inline void cuda_mem(void)
 }; \
 dim3 cu_blocks (num1,num2,num3); \
 dim3 cu_threads(num4,num5,num6); \
-Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
+Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
 }
 template<typename lambda> __global__
@@ -189,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
 #define accelerator_barrier(dummy) \
 { \
-cudaDeviceSynchronize(); \
+cudaStreamSynchronize(cpuStream); \
 cudaError err = cudaGetLastError(); \
 if ( cudaSuccess != err ) { \
 printf("accelerator_barrier(): Cuda error %s \n", \
@@ -339,6 +340,7 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator_inline __host__ __device__ inline
 extern hipStream_t copyStream;
+extern hipStream_t cpuStream;
 /*These routines define mapping from thread grid to loop & vector lane indexing */
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
@@ -360,12 +362,12 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
 if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
 hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
-0,0, \
+0,cpuStream, \
 num1,num2,nsimd, lambda); \
 } else { \
 hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
-0,0, \
+0,cpuStream, \
 num1,num2,nsimd, lambda); \
 } \
 }
@@ -398,7 +400,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 #define accelerator_barrier(dummy) \
 { \
-hipDeviceSynchronize(); \
+hipStreamSynchronize(cpuStream); \
 auto err = hipGetLastError(); \
 if ( err != hipSuccess ) { \
 printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \