1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-18 07:47:06 +01:00

Merge branch 'develop' of https://github.com/paboyle/Grid into develop

This commit is contained in:
Peter Boyle
2021-09-21 09:25:07 -07:00
123 changed files with 3362 additions and 1073 deletions

View File

@ -8,6 +8,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
void acceleratorInit(void)
{
int nDevices = 1;
@ -83,11 +84,11 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit: using default device \n");
printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n");
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
}
#else
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n");
printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
cudaSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");

View File

@ -39,6 +39,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#ifdef __APPLE__
// no memalign
inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); }
#endif
NAMESPACE_BEGIN(Grid);
@ -101,6 +105,7 @@ void acceleratorInit(void);
#define accelerator_inline __host__ __device__ inline
extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@ -209,9 +214,13 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline int acceleratorIsCommunicable(void *ptr)
{
// int uvm=0;
@ -285,7 +294,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
theGridAccelerator->memcpy(to,from,bytes);
}
inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
@ -390,7 +402,8 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
#endif
@ -419,6 +432,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
#undef GRID_SIMT
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
@ -429,7 +444,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopySynchronize(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}