mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-18 15:57:05 +01:00
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
This commit is contained in:
@ -84,7 +84,8 @@ void acceleratorInit(void)
|
||||
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorCudaInit: using default device \n");
|
||||
printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
|
||||
printf("AcceleratorCudaInit: assume user either uses\n");
|
||||
printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
|
||||
printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
|
||||
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
|
||||
}
|
||||
@ -109,6 +110,7 @@ void acceleratorInit(void)
|
||||
|
||||
#ifdef GRID_HIP
|
||||
hipDeviceProp_t *gpu_props;
|
||||
hipStream_t copyStream;
|
||||
void acceleratorInit(void)
|
||||
{
|
||||
int nDevices = 1;
|
||||
@ -166,16 +168,25 @@ void acceleratorInit(void)
|
||||
#ifdef GRID_DEFAULT_GPU
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorHipInit: using default device \n");
|
||||
printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
|
||||
printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
|
||||
}
|
||||
int device = 0;
|
||||
#else
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
|
||||
printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
|
||||
}
|
||||
hipSetDevice(rank);
|
||||
int device = rank;
|
||||
#endif
|
||||
hipSetDevice(device);
|
||||
hipStreamCreate(©Stream);
|
||||
const int len=64;
|
||||
char busid[len];
|
||||
if( rank == world_rank ) {
|
||||
hipDeviceGetPCIBusId(busid, len, device);
|
||||
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
|
||||
}
|
||||
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");
|
||||
}
|
||||
#endif
|
||||
|
@ -230,6 +230,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
|
||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
{
|
||||
// int uvm=0;
|
||||
@ -337,6 +338,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
#define accelerator __host__ __device__
|
||||
#define accelerator_inline __host__ __device__ inline
|
||||
|
||||
extern hipStream_t copyStream;
|
||||
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#ifdef GRID_SIMT
|
||||
@ -411,10 +413,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||
inline void acceleratorCopySynchronise(void) { }
|
||||
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||
//inline void acceleratorCopySynchronise(void) { }
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
|
||||
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -485,18 +493,12 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);};
|
||||
///////////////////////////////////////////////////
|
||||
// Synchronise across local threads for divergence resynch
|
||||
///////////////////////////////////////////////////
|
||||
accelerator_inline void acceleratorSynchronise(void)
|
||||
accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs
|
||||
{
|
||||
#ifdef GRID_SIMT
|
||||
#ifdef GRID_CUDA
|
||||
__syncwarp();
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
//cl::sycl::detail::workGroupBarrier();
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
__syncthreads();
|
||||
#endif
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
Reference in New Issue
Block a user