From 0b905a72ddfafcf02bcc4b6738ff31c74be79ed5 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 29 Oct 2021 02:22:22 +0100
Subject: [PATCH] Better reduction for GPUs

---
 Grid/lattice/Lattice_reduction_gpu.h | 48 ++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c2875052..823e497e 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 
 template <class Iterator>
-void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
   
   int device;
 #ifdef GRID_CUDA
@@ -37,13 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
   Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  
+  /*
   std::cout << GridLogDebug << "GPU has:" << std::endl;
   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  
+  */
   if (warpSize != WARP_SIZE) {
     std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
     exit(EXIT_FAILURE);
@@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   threads = warpSize;
   if ( threads*sizeofsobj > sharedMemPerBlock ) {
     std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
-    exit(EXIT_FAILURE);
+    return 0;
   }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
   // keep all the streaming multiprocessors busy
   blocks = nextPow2(multiProcessorCount);
-  
+  return 1;
 }
 
 template <class sobj, class Iterator>
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 //  Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer osites)
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
@@ -208,6 +208,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 
   Integer numThreads, numBlocks;
   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+
   Integer smemSize = numThreads * sizeof(sobj);
 
   Vector<sobj> buffer(numBlocks);
@@ -218,6 +219,41 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   auto result = buffer_v[0];
   return result;
 }
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type  vector;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobj;
+  sobj ret;
+  scalarD *ret_p = (scalarD *)&ret;
+
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+  Integer numThreads, numBlocks;
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+
+  if ( ok ) {
+    ret = sumD_gpu_internal(lat,osites);
+  } else {
+    std::cout << GridLogWarning << " dropping to summing word by word for large object size "<<sizeof(vobj)<<std::endl;
+    Vector<vector> buffer(osites);
+    vector *dat = (vector *)lat;
+    vector *buf = &buffer[0];
+    iScalar<vector> *tbuf = (iScalar<vector> *) &buffer[0];
+    for(int w=0;w<words;w++) {
+
+      accelerator_for(ss,osites,1,{
+          buf[ss] = dat[ss*words+w];
+      });
+
+      ret_p[w] = sumD_gpu_internal(tbuf,osites);
+    }
+  }
+  return ret;
+}
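
Review note: the first half of the patch makes getNumBlocksAndThreads() report
failure (return 0) instead of calling exit() when even one warp's worth of
sobj-sized accumulators overflows shared memory, so the caller can choose a
fallback. A minimal host-side sketch of that sizing rule; pick_threads and the
48 KiB / 1024-thread / 32-wide device limits below are illustrative stand-ins,
not Grid API:

    #include <cstdio>

    // Grow the block size in powers of two from one warp while each thread's
    // sobj-sized shared-memory slot still fits; report failure rather than exit.
    int pick_threads(long sizeofsobj, long sharedMemPerBlock,
                     long maxThreadsPerBlock, long warpSize, long &threads) {
      threads = warpSize;
      if (threads * sizeofsobj > sharedMemPerBlock) return 0; // caller must fall back
      while (2 * threads * sizeofsobj < sharedMemPerBlock &&
             2 * threads <= maxThreadsPerBlock)
        threads *= 2;
      return 1;
    }

    int main() {
      long threads;
      // Hypothetical limits: 48 KiB shared memory, 1024 threads/block, warps of 32.
      if (pick_threads(144, 48 * 1024, 1024, 32, threads))
        printf("threads = %ld\n", threads); // a 144-byte object yields 256 threads
      return 0;
    }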
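
Review note: when that query fails, the new sumD_gpu() wrapper reduces the
lattice one SIMD word at a time: gather word w of every site into a contiguous
buffer, reduce the buffer as a one-word iScalar object, and store the partial
sum into the w-th word of the result. A self-contained host-side sketch of the
same decomposition, under assumed names (Obj, WORDS, sum_word_by_word; the
serial loops stand in for accelerator_for and sumD_gpu_internal):

    #include <cstdio>
    #include <vector>

    constexpr int WORDS = 6;          // hypothetical "large object": 6 words/site
    struct Obj { double w[WORDS]; };

    Obj sum_word_by_word(const Obj *lat, int nsites) {
      Obj ret{};
      std::vector<double> buffer(nsites);
      const double *dat = (const double *)lat;  // view sites as a flat word array
      for (int w = 0; w < WORDS; w++) {
        for (int ss = 0; ss < nsites; ss++)     // strided gather of word w
          buffer[ss] = dat[ss * WORDS + w];
        double partial = 0;                     // stand-in for the GPU tree reduction
        for (int ss = 0; ss < nsites; ss++) partial += buffer[ss];
        ret.w[w] = partial;                     // reassemble the reduced object
      }
      return ret;
    }

    int main() {
      std::vector<Obj> lat(1000);
      for (auto &s : lat)
        for (int w = 0; w < WORDS; w++) s.w[w] = 1.0;   // every word sums to 1000
      Obj r = sum_word_by_word(lat.data(), (int)lat.size());
      printf("%g %g\n", r.w[0], r.w[WORDS - 1]);        // expect: 1000 1000
      return 0;
    }

The per-word pass trades one large shared-memory reduction for one small
reduction per word plus a strided copy, and only engages when the object is
too big for the fast path.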