Repository: https://github.com/paboyle/Grid.git (mirror, synced 2025-04-09 21:50:45 +01:00)

Commit:  92b342a477
Parent:  556da86ac3
Subject: Hip reduction too
@@ -1,7 +1,13 @@
 NAMESPACE_BEGIN(Grid);
 
-#define WARP_SIZE 32
+#ifdef GRID_HIP
+extern hipDeviceProp_t *gpu_props;
+#endif
+#ifdef GRID_CUDA
 extern cudaDeviceProp *gpu_props;
+#endif
+
+#define WARP_SIZE 32
 __device__ unsigned int retirementCount = 0;
 
 template <class Iterator>
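Note on the hunk above: the `gpu_props` declaration now carries the property type of whichever backend is compiled in, and both `hipDeviceProp_t` and `cudaDeviceProp` expose the `warpSize` and `sharedMemPerBlock` fields that `getNumBlocksAndThreads` reads later. Below is a minimal sketch of how such a per-device table could be populated, using a hypothetical `initGpuProps` helper that is not part of Grid; it only illustrates why the same field accesses work under either guard.

#ifdef GRID_HIP
#include <hip/hip_runtime.h>
static hipDeviceProp_t *gpu_props;              // same shape as the extern in the hunk
static void initGpuProps(int ndevices) {        // hypothetical helper, illustration only
  gpu_props = new hipDeviceProp_t[ndevices];
  for (int d = 0; d < ndevices; d++)
    hipGetDeviceProperties(&gpu_props[d], d);   // fills warpSize, sharedMemPerBlock, ...
}
#endif
#ifdef GRID_CUDA
#include <cuda_runtime.h>
static cudaDeviceProp *gpu_props;               // same shape as the extern in the hunk
static void initGpuProps(int ndevices) {        // hypothetical helper, illustration only
  gpu_props = new cudaDeviceProp[ndevices];
  for (int d = 0; d < ndevices; d++)
    cudaGetDeviceProperties(&gpu_props[d], d);  // fills warpSize, sharedMemPerBlock, ...
}
#endif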
@@ -19,7 +25,12 @@ template <class Iterator>
 void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
   
   int device;
+#ifdef GRID_CUDA
   cudaGetDevice(&device);
+#endif
+#ifdef GRID_HIP
+  hipGetDevice(&device);
+#endif
   
   Iterator warpSize = gpu_props[device].warpSize;
   Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
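The hunk above repeats the backend dispatch at the call site. The same selection could equally be folded into one small inline helper; the sketch below uses a hypothetical name, `currentAcceleratorDevice`, which is not a Grid API, purely to show the pattern.

#ifdef GRID_CUDA
#include <cuda_runtime.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime.h>
#endif

inline int currentAcceleratorDevice(void) {  // hypothetical helper, not part of Grid
  int device = 0;
#ifdef GRID_CUDA
  cudaGetDevice(&device);                    // ordinal of the device bound to this host thread
#endif
#ifdef GRID_HIP
  hipGetDevice(&device);                     // HIP equivalent of cudaGetDevice
#endif
  return device;
}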
@@ -147,7 +158,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
   sobj *smem = (sobj *)shmem_pointer;
   
   // wait until all outstanding memory instructions in this thread are finished
-  __threadfence();
+  acceleratorFence();
   
   if (tid==0) {
     unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
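Here `acceleratorFence()` takes the place of the CUDA-only `__threadfence()` so the same kernel source builds under HIP. One plausible way such a device-side fence wrapper could be defined is sketched below; this is an assumption for illustration, not the definition in Grid's Accelerator.h.

// Assumed illustrative definition: both CUDA and HIP device code provide
// __threadfence(), which makes this thread's prior global/shared memory
// writes visible device-wide before the retirement counter is incremented.
#if defined(GRID_CUDA) || defined(GRID_HIP)
#define acceleratorFence() __threadfence()
#endif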
@@ -156,8 +167,8 @@
   }
   
   // each thread must read the correct value of amLast
-  __syncthreads();
+  acceleratorSynchroniseAll();
   
   if (amLast) {
     // reduce buffer[0], ..., buffer[gridDim.x-1]
     Iterator i = tid;
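Likewise `acceleratorSynchroniseAll()` stands in for `__syncthreads()`: the block-wide barrier is what guarantees every thread in the last block observes the `amLast` flag written by thread 0 before the final reduction over buffer[0..gridDim.x-1] begins. A plausible portable definition, again only an assumption and not copied from Grid, is:

// Assumed illustrative definition: __syncthreads() exists in both CUDA and
// HIP device code and acts as a block-wide execution and memory barrier.
#if defined(GRID_CUDA) || defined(GRID_HIP)
#define acceleratorSynchroniseAll() __syncthreads()
#endif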
@@ -199,13 +210,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   sobj *buffer_v = &buffer[0];
   
   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
-  cudaDeviceSynchronize();
-  
-  cudaError err = cudaGetLastError();
-  if ( cudaSuccess != err ) {
-    printf("Cuda error %s\n",cudaGetErrorString( err ));
-    exit(0);
-  }
+  accelerator_barrier();
   auto result = buffer_v[0];
   return result;
 }
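On the host side, `accelerator_barrier()` replaces the explicit `cudaDeviceSynchronize()` plus error check, so `sumD_gpu` no longer contains CUDA-only calls. The sketch below shows what a backend-neutral barrier with the same error reporting could look like; the macro bodies are assumptions for illustration, not Grid's actual definition.

#include <cstdio>
#include <cstdlib>
#ifdef GRID_CUDA
#include <cuda_runtime.h>
// Assumed illustrative definition: wait for all queued device work, then
// surface any launch/runtime error, mirroring the hand-written check removed above.
#define accelerator_barrier()                                 \
  do {                                                        \
    cudaDeviceSynchronize();                                  \
    cudaError_t err = cudaGetLastError();                     \
    if (err != cudaSuccess) {                                 \
      printf("Cuda error %s\n", cudaGetErrorString(err));     \
      exit(0);                                                \
    }                                                         \
  } while (0)
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime.h>
// Same idea on the HIP path, using the HIP runtime's error API.
#define accelerator_barrier()                                 \
  do {                                                        \
    hipDeviceSynchronize();                                   \
    hipError_t err = hipGetLastError();                       \
    if (err != hipSuccess) {                                  \
      printf("Hip error %s\n", hipGetErrorString(err));       \
      exit(0);                                                \
    }                                                         \
  } while (0)
#endif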