1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Copy stream HIP improvements

This commit is contained in:
Peter Boyle 2022-08-04 15:24:03 -04:00
parent 806b02bddf
commit 2cb5bedc15
2 changed files with 15 additions and 10 deletions

View File

@ -1,6 +1,7 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
// World rank of this process; used to guard printing.
int world_rank;
// Flag, enabled (1) by default.
// NOTE(review): presumably selects aborting when a GPU error is detected — confirm at the use sites.
int acceleratorAbortOnGpuError=1;
// Current accelerator thread-count setting; starts at 2.
// NOTE(review): the exact meaning of "threads" is not visible here — confirm against the kernel launch code.
uint32_t accelerator_threads=2;
// Getter: returns the current value of accelerator_threads.
uint32_t acceleratorThreads(void) {return accelerator_threads;};
@ -16,7 +17,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t cpuStream;
cudaStream_t computeStream;
void acceleratorInit(void)
{
int nDevices = 1;
@ -24,7 +25,8 @@ void acceleratorInit(void)
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
@ -99,7 +101,7 @@ void acceleratorInit(void)
cudaSetDevice(device);
cudaStreamCreate(&copyStream);
cudaStreamCreate(&cpuStream);
cudaStreamCreate(&computeStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
@ -114,7 +116,7 @@ void acceleratorInit(void)
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
hipStream_t cpuStream;
hipStream_t computeStream;
void acceleratorInit(void)
{
int nDevices = 1;
@ -122,7 +124,8 @@ void acceleratorInit(void)
gpu_props = new hipDeviceProp_t[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
@ -183,7 +186,7 @@ void acceleratorInit(void)
#endif
hipSetDevice(device);
hipStreamCreate(&copyStream);
hipStreamCreate(&cpuStream);
hipStreamCreate(&computeStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
@ -210,7 +213,8 @@ void acceleratorInit(void)
#endif
char * localRankStr = NULL;
int rank = 0, world_rank=0;
int rank = 0;
world_rank=0;
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)

View File

@ -370,7 +370,8 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
num1,num2,nsimd, lambda); \
} \
}
// Works with MPI if barrier here
// accelerator_barrier();
template<typename lambda> __global__
__launch_bounds__(64,1)
@ -400,7 +401,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
#define accelerator_barrier(dummy) \
{ \
hipStreamSynchronize(cpuStream); \
hipDeviceSynchronize(); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@ -443,7 +444,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
// Queue a device-to-device copy of `bytes` bytes from `from` to `to` on
// copyStream and return without waiting for it to finish.
//
// from  : source pointer
// to    : destination pointer
// bytes : number of bytes to copy
//
// The copy is only enqueued here; call acceleratorCopySynchronise() before
// relying on the contents of `to`.
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
  // Only the stream-ordered copy is issued. A preceding blocking hipMemcpy of
  // the same range would stall the host and transfer the data twice.
  hipMemcpyDtoDAsync(to,from,bytes, copyStream);
}
// Wait until all work queued on copyStream has completed.
inline void acceleratorCopySynchronise(void)
{
  hipStreamSynchronize(copyStream);
}