Mirror of https://github.com/paboyle/Grid.git

Copy stream HIP improvements

commit 2cb5bedc15
parent 806b02bddf
Author: Peter Boyle
Date:   2022-08-04 15:24:03 -04:00

2 changed files with 15 additions and 10 deletions
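In brief: the diff renames the second CUDA/HIP stream from cpuStream to computeStream, promotes world_rank from a local to a file-scope global so prints can be rank-guarded, and switches the HIP device-to-device copy from a blocking hipMemcpy to an asynchronous hipMemcpyDtoDAsync on the dedicated copyStream. A minimal sketch of the two-stream pattern the diff converges on (not Grid code; initStreams and the casts are illustrative):

// Minimal sketch of the two-stream pattern this commit adopts for HIP:
// transfers queue on copyStream, kernels on computeStream, so copies can
// overlap compute; callers must synchronise before consuming the data.
#include <hip/hip_runtime.h>
#include <cstddef>

hipStream_t copyStream;     // device-to-device transfers only
hipStream_t computeStream;  // kernel launches only

void initStreams(void) {    // illustrative helper; Grid does this in acceleratorInit()
  hipStreamCreate(&copyStream);
  hipStreamCreate(&computeStream);
}

void copyDtoDAsync(void *to, const void *from, size_t bytes) {
  // Returns immediately; the copy runs concurrently with kernels already
  // queued on computeStream.
  hipMemcpyDtoDAsync((hipDeviceptr_t)to, (hipDeviceptr_t)from, bytes, copyStream);
}

void copySynchronise(void) {
  hipStreamSynchronize(copyStream); // block until all queued copies finish
}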

File 1:

@@ -1,6 +1,7 @@
 #include <Grid/GridCore.h>
 NAMESPACE_BEGIN(Grid);
+int world_rank; // Use to control world rank for print guarding
 int acceleratorAbortOnGpuError=1;
 uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void) {return accelerator_threads;};
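With world_rank at file scope, any translation unit can guard its output on rank; a minimal sketch of that idiom (the macro name is an assumption, only world_rank comes from the hunk above):

#include <cstdio>

extern int world_rank; // file-scope global introduced by this hunk

// Hypothetical convenience macro: only the rank-0 process prints.
#define guarded_printf(...) \
  do { if (world_rank == 0) printf(__VA_ARGS__); } while (0)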
@@ -16,7 +17,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
 #ifdef GRID_CUDA
 cudaDeviceProp *gpu_props;
 cudaStream_t copyStream;
-cudaStream_t cpuStream;
+cudaStream_t computeStream;
 void acceleratorInit(void)
 {
   int nDevices = 1;
@@ -24,7 +25,8 @@ void acceleratorInit(void)
   gpu_props = new cudaDeviceProp[nDevices];
   char * localRankStr = NULL;
-  int rank = 0, world_rank=0;
+  int rank = 0;
+  world_rank=0;
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}
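For reference, the probing idiom above as a standalone function; ENV_RANK_* are Grid macros, and the literal variable names below are the usual launcher exports, assumed here:

#include <cstdlib>

// Each MPI launcher exports the world rank under a different environment
// variable, so all candidates are probed in turn (later hits win, as above).
static int detectWorldRank(void) {
  int wr = 0;
  const char *s;
  if ((s = getenv("OMPI_COMM_WORLD_RANK")) != NULL) wr = atoi(s); // OpenMPI
  if ((s = getenv("MV2_COMM_WORLD_RANK"))  != NULL) wr = atoi(s); // MVAPICH
  if ((s = getenv("SLURM_PROCID"))         != NULL) wr = atoi(s); // SLURM
  return wr;
}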
@@ -99,7 +101,7 @@ void acceleratorInit(void)
   cudaSetDevice(device);
   cudaStreamCreate(&copyStream);
-  cudaStreamCreate(&cpuStream);
+  cudaStreamCreate(&computeStream);
   const int len=64;
   char busid[len];
   if( rank == world_rank ) {
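Both streams now exist at init time; kernels are expected to launch on computeStream so they can overlap transfers on copyStream. A sketch of such a launch (assumed shape, echoing Grid's LambdaApply; compile with nvcc --extended-lambda):

#include <cuda_runtime.h>
#include <cstdint>

extern cudaStream_t computeStream; // created in acceleratorInit() above

// Illustrative 1-D lambda-apply kernel; not the commit's code.
template<typename lambda> __global__
void Kernel1D(uint64_t num, lambda Lambda) {
  uint64_t i = blockIdx.x*(uint64_t)blockDim.x + threadIdx.x;
  if (i < num) Lambda(i);
}

template<typename lambda>
void Launch1D(uint64_t num, lambda Lambda) {
  const int threads = 64;
  const int blocks  = (int)((num + threads - 1)/threads);
  // The fourth launch parameter selects the stream: work queues on
  // computeStream rather than the legacy default stream.
  Kernel1D<<<blocks, threads, 0, computeStream>>>(num, Lambda);
}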
@@ -114,7 +116,7 @@ void acceleratorInit(void)
 #ifdef GRID_HIP
 hipDeviceProp_t *gpu_props;
 hipStream_t copyStream;
-hipStream_t cpuStream;
+hipStream_t computeStream;
 void acceleratorInit(void)
 {
   int nDevices = 1;
@@ -122,7 +124,8 @@ void acceleratorInit(void)
   gpu_props = new hipDeviceProp_t[nDevices];
   char * localRankStr = NULL;
-  int rank = 0, world_rank=0;
+  int rank = 0;
+  world_rank=0;
   // We extract the local rank initialization using an environment variable
   if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
   {
@@ -183,7 +186,7 @@ void acceleratorInit(void)
 #endif
   hipSetDevice(device);
   hipStreamCreate(&copyStream);
-  hipStreamCreate(&cpuStream);
+  hipStreamCreate(&computeStream);
   const int len=64;
   char busid[len];
   if( rank == world_rank ) {
@@ -210,7 +213,8 @@ void acceleratorInit(void)
 #endif
   char * localRankStr = NULL;
-  int rank = 0, world_rank=0;
+  int rank = 0;
+  world_rank=0;
   // We extract the local rank initialization using an environment variable
   if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)

File 2:

@@ -370,7 +370,8 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
     num1,num2,nsimd, lambda); \
   } \
  }
+// Works with MPI if barrier here
+// accelerator_barrier();
 template<typename lambda> __global__
 __launch_bounds__(64,1)
@@ -400,7 +401,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 #define accelerator_barrier(dummy) \
   { \
-    hipStreamSynchronize(cpuStream); \
+    hipDeviceSynchronize(); \
     auto err = hipGetLastError(); \
     if ( err != hipSuccess ) { \
       printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -443,7 +444,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
-  hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);
+  hipMemcpyDtoDAsync(to,from,bytes, copyStream);
 }
 inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
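The copy is now only guaranteed complete after acceleratorCopySynchronise(); a usage sketch (assumed caller, not from the commit):

#include <Grid/GridCore.h> // assumed to bring the accelerator API into scope

// Overlap a device-to-device copy with independent compute, then fence.
void overlappedCopy(void *src, void *dst, size_t bytes) {
  acceleratorCopyDeviceToDeviceAsynch(src, dst, bytes); // queued on copyStream
  // ...kernels on computeStream that do not touch dst may run here...
  acceleratorCopySynchronise(); // hipStreamSynchronize(copyStream)
}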