mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Introduce a non-default stream for compute operations
This commit is contained in:
parent
57b442d0de
commit
bd99fd608c
@ -16,6 +16,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
|
|||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
cudaDeviceProp *gpu_props;
|
cudaDeviceProp *gpu_props;
|
||||||
cudaStream_t copyStream;
|
cudaStream_t copyStream;
|
||||||
|
cudaStream_t cpuStream;
|
||||||
void acceleratorInit(void)
|
void acceleratorInit(void)
|
||||||
{
|
{
|
||||||
int nDevices = 1;
|
int nDevices = 1;
|
||||||
@ -98,6 +99,7 @@ void acceleratorInit(void)
|
|||||||
|
|
||||||
cudaSetDevice(device);
|
cudaSetDevice(device);
|
||||||
cudaStreamCreate(&copyStream);
|
cudaStreamCreate(&copyStream);
|
||||||
|
cudaStreamCreate(&cpuStream);
|
||||||
const int len=64;
|
const int len=64;
|
||||||
char busid[len];
|
char busid[len];
|
||||||
if( rank == world_rank ) {
|
if( rank == world_rank ) {
|
||||||
@ -112,6 +114,7 @@ void acceleratorInit(void)
|
|||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
hipDeviceProp_t *gpu_props;
|
hipDeviceProp_t *gpu_props;
|
||||||
hipStream_t copyStream;
|
hipStream_t copyStream;
|
||||||
|
hipStream_t cpuStream;
|
||||||
void acceleratorInit(void)
|
void acceleratorInit(void)
|
||||||
{
|
{
|
||||||
int nDevices = 1;
|
int nDevices = 1;
|
||||||
@ -180,6 +183,7 @@ void acceleratorInit(void)
|
|||||||
#endif
|
#endif
|
||||||
hipSetDevice(device);
|
hipSetDevice(device);
|
||||||
hipStreamCreate(&copyStream);
|
hipStreamCreate(&copyStream);
|
||||||
|
hipStreamCreate(&cpuStream);
|
||||||
const int len=64;
|
const int len=64;
|
||||||
char busid[len];
|
char busid[len];
|
||||||
if( rank == world_rank ) {
|
if( rank == world_rank ) {
|
||||||
|
@ -107,6 +107,7 @@ void acceleratorInit(void);
|
|||||||
|
|
||||||
extern int acceleratorAbortOnGpuError;
|
extern int acceleratorAbortOnGpuError;
|
||||||
extern cudaStream_t copyStream;
|
extern cudaStream_t copyStream;
|
||||||
|
extern cudaStream_t cpuStream;
|
||||||
|
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
@ -134,7 +135,7 @@ inline void cuda_mem(void)
|
|||||||
}; \
|
}; \
|
||||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||||
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
|
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define accelerator_for6dNB(iter1, num1, \
|
#define accelerator_for6dNB(iter1, num1, \
|
||||||
@ -153,7 +154,7 @@ inline void cuda_mem(void)
|
|||||||
}; \
|
}; \
|
||||||
dim3 cu_blocks (num1,num2,num3); \
|
dim3 cu_blocks (num1,num2,num3); \
|
||||||
dim3 cu_threads(num4,num5,num6); \
|
dim3 cu_threads(num4,num5,num6); \
|
||||||
Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
|
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename lambda> __global__
|
template<typename lambda> __global__
|
||||||
@ -189,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
|
|||||||
|
|
||||||
#define accelerator_barrier(dummy) \
|
#define accelerator_barrier(dummy) \
|
||||||
{ \
|
{ \
|
||||||
cudaDeviceSynchronize(); \
|
cudaStreamSynchronize(cpuStream); \
|
||||||
cudaError err = cudaGetLastError(); \
|
cudaError err = cudaGetLastError(); \
|
||||||
if ( cudaSuccess != err ) { \
|
if ( cudaSuccess != err ) { \
|
||||||
printf("accelerator_barrier(): Cuda error %s \n", \
|
printf("accelerator_barrier(): Cuda error %s \n", \
|
||||||
@ -339,6 +340,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
#define accelerator_inline __host__ __device__ inline
|
#define accelerator_inline __host__ __device__ inline
|
||||||
|
|
||||||
extern hipStream_t copyStream;
|
extern hipStream_t copyStream;
|
||||||
|
extern hipStream_t cpuStream;
|
||||||
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
@ -360,12 +362,12 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
|||||||
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
|
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
|
||||||
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
|
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
|
||||||
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
|
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
|
||||||
0,0, \
|
0,cpuStream, \
|
||||||
num1,num2,nsimd, lambda); \
|
num1,num2,nsimd, lambda); \
|
||||||
} else { \
|
} else { \
|
||||||
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
|
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
|
||||||
0,0, \
|
0,cpuStream, \
|
||||||
num1,num2,nsimd, lambda); \
|
num1,num2,nsimd, lambda); \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -398,7 +400,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
|||||||
|
|
||||||
#define accelerator_barrier(dummy) \
|
#define accelerator_barrier(dummy) \
|
||||||
{ \
|
{ \
|
||||||
hipDeviceSynchronize(); \
|
hipStreamSynchronize(cpuStream); \
|
||||||
auto err = hipGetLastError(); \
|
auto err = hipGetLastError(); \
|
||||||
if ( err != hipSuccess ) { \
|
if ( err != hipSuccess ) { \
|
||||||
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
|
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user