mirror of
https://github.com/paboyle/Grid.git
synced 2026-05-27 12:34:16 +01:00
sumD_gpu_reduce_words: fuse pack+reduce into single packReduceKernel
Replace the two-kernel pack+reduce sequence with a single fused kernel packReduceKernel<R> that reads R words of each vobj at offset 'base' and accumulates directly into iVector<iScalar<scalarD>,R>, eliminating the intermediate bundle buffer entirely. HBM access per word-group drops from 3x (pack-read + pack-write + reduce-read) to 1x. Thread count comes from getNumBlocksAndThreads (warpSize..256) rather than acceleratorThreads(), so occupancy is correct regardless of the --accelerator-threads setting. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -239,41 +239,136 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pack R consecutive vector_type words of lat[0..osites-1] starting at word
|
// Fused pack+reduce: reads R words of each vobj at word offset 'base',
|
||||||
// 'base' into a Bundle = iVector<iScalar<vector>,R> per site, then reduce
|
// accumulates directly into iVector<iScalar<scalarD>,R> without staging
|
||||||
// with sumD_gpu_small. Bundle::Nsimd() == vector::Nsimd(), so the existing
|
// through an intermediate bundle buffer. One HBM pass instead of three.
|
||||||
// shared-memory kernel handles SIMD-lane extraction and double-promotion
|
template <int R, class vobj, class sobj, class Iterator>
|
||||||
// correctly. sizeof(Bundle::scalar_objectD) = R*sizeof(scalarD) <= 192 B
|
__device__ void packReduceBlocks(
|
||||||
// for R<=12, safely within sharedMemPerBlock on all supported devices.
|
const iScalar<typename vobj::vector_type> *idat,
|
||||||
|
sobj *g_odata, Iterator osites, int base, int words)
|
||||||
|
{
|
||||||
|
constexpr Iterator nsimd = vobj::Nsimd();
|
||||||
|
Iterator blockSize = blockDim.x;
|
||||||
|
|
||||||
|
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
|
||||||
|
sobj *sdata = (sobj *)shmem_pointer;
|
||||||
|
|
||||||
|
Iterator tid = threadIdx.x;
|
||||||
|
Iterator i = blockIdx.x * (blockSize * 2) + threadIdx.x;
|
||||||
|
Iterator gridSize = blockSize * 2 * gridDim.x;
|
||||||
|
sobj mySum = Zero();
|
||||||
|
|
||||||
|
while (i < osites * nsimd) {
|
||||||
|
Iterator lane = i % nsimd;
|
||||||
|
Iterator ss = i / nsimd;
|
||||||
|
sobj tmpD; zeroit(tmpD);
|
||||||
|
for (int k = 0; k < R; k++) {
|
||||||
|
auto w = extractLane(lane, idat[ss * words + base + k]);
|
||||||
|
iScalar<typename vobj::scalar_typeD> wd; wd = w;
|
||||||
|
tmpD._internal[k] = wd;
|
||||||
|
}
|
||||||
|
mySum += tmpD;
|
||||||
|
|
||||||
|
if (i + blockSize < osites * nsimd) {
|
||||||
|
lane = (i + blockSize) % nsimd;
|
||||||
|
ss = (i + blockSize) / nsimd;
|
||||||
|
sobj tmpD2; zeroit(tmpD2);
|
||||||
|
for (int k = 0; k < R; k++) {
|
||||||
|
auto w = extractLane(lane, idat[ss * words + base + k]);
|
||||||
|
iScalar<typename vobj::scalar_typeD> wd; wd = w;
|
||||||
|
tmpD2._internal[k] = wd;
|
||||||
|
}
|
||||||
|
mySum += tmpD2;
|
||||||
|
}
|
||||||
|
i += gridSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
reduceBlock(sdata, mySum, tid);
|
||||||
|
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int R, class vobj, class sobj, class Iterator>
|
||||||
|
__global__ void packReduceKernel(
|
||||||
|
const iScalar<typename vobj::vector_type> *idat,
|
||||||
|
sobj *buffer, Iterator osites, int base, int words)
|
||||||
|
{
|
||||||
|
Iterator blockSize = blockDim.x;
|
||||||
|
|
||||||
|
packReduceBlocks<R, vobj, sobj>(idat, buffer, osites, base, words);
|
||||||
|
|
||||||
|
if (gridDim.x > 1) {
|
||||||
|
const Iterator tid = threadIdx.x;
|
||||||
|
__shared__ bool amLast;
|
||||||
|
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
|
||||||
|
sobj *smem = (sobj *)shmem_pointer;
|
||||||
|
|
||||||
|
acceleratorFence();
|
||||||
|
|
||||||
|
if (tid == 0) {
|
||||||
|
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
|
||||||
|
amLast = (ticket == gridDim.x - 1);
|
||||||
|
}
|
||||||
|
acceleratorSynchroniseAll();
|
||||||
|
|
||||||
|
if (amLast) {
|
||||||
|
Iterator i = tid;
|
||||||
|
sobj mySum = Zero();
|
||||||
|
while (i < (Iterator)gridDim.x) {
|
||||||
|
mySum += buffer[i];
|
||||||
|
i += blockSize;
|
||||||
|
}
|
||||||
|
reduceBlock(smem, mySum, tid);
|
||||||
|
if (tid == 0) {
|
||||||
|
buffer[0] = smem[0];
|
||||||
|
retirementCount = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<int R, class vobj>
|
template<int R, class vobj>
|
||||||
inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites,
|
inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites,
|
||||||
typename vobj::scalar_typeD *ret_p, int base)
|
typename vobj::scalar_typeD *ret_p, int base)
|
||||||
{
|
{
|
||||||
typedef typename vobj::vector_type vector;
|
typedef typename vobj::vector_type vector;
|
||||||
using Bundle = iVector<iScalar<vector>, R>;
|
typedef typename vobj::scalar_typeD scalarD;
|
||||||
|
using BundleScalarD = iVector<iScalar<scalarD>, R>;
|
||||||
|
|
||||||
const int words = sizeof(vobj) / sizeof(vector);
|
|
||||||
iScalar<vector> *idat = (iScalar<vector> *)lat;
|
|
||||||
|
|
||||||
deviceVector<Bundle> buf(osites);
|
|
||||||
Bundle *buf_p = &buf[0];
|
|
||||||
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
RealD t_pack = -usecond();
|
|
||||||
#endif
|
|
||||||
constexpr int Nsimd = vobj::Nsimd();
|
constexpr int Nsimd = vobj::Nsimd();
|
||||||
accelerator_for2d(k, R, ss, osites, Nsimd, {
|
const int words = sizeof(vobj) / sizeof(vector);
|
||||||
coalescedWrite(buf_p[ss]._internal[k], coalescedRead(idat[ss * words + base + k]));
|
const iScalar<vector> *idat = (const iScalar<vector> *)lat;
|
||||||
});
|
|
||||||
|
Integer size = (Integer)osites * Nsimd;
|
||||||
|
Integer numThreads, numBlocks;
|
||||||
|
int ok = getNumBlocksAndThreads(size, sizeof(BundleScalarD), numThreads, numBlocks);
|
||||||
|
GRID_ASSERT(ok);
|
||||||
|
|
||||||
|
Integer smemSize = numThreads * sizeof(BundleScalarD);
|
||||||
|
deviceVector<BundleScalarD> buffer(numBlocks);
|
||||||
|
BundleScalarD *buffer_v = &buffer[0];
|
||||||
|
BundleScalarD result;
|
||||||
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
#ifdef GRID_REDUCTION_TIMING
|
||||||
t_pack += usecond();
|
RealD t_kernel = -usecond();
|
||||||
|
#endif
|
||||||
|
packReduceKernel<R, vobj, BundleScalarD, Integer>
|
||||||
|
<<<numBlocks, numThreads, smemSize, computeStream>>>
|
||||||
|
(idat, buffer_v, osites, base, words);
|
||||||
|
accelerator_barrier();
|
||||||
|
#ifdef GRID_REDUCTION_TIMING
|
||||||
|
t_kernel += usecond();
|
||||||
|
RealD t_d2h = -usecond();
|
||||||
|
#endif
|
||||||
|
acceleratorCopyFromDevice(buffer_v, &result, sizeof(result));
|
||||||
|
#ifdef GRID_REDUCTION_TIMING
|
||||||
|
t_d2h += usecond();
|
||||||
std::cout << GridLogMessage << " sumD_gpu_reduce_words R=" << R
|
std::cout << GridLogMessage << " sumD_gpu_reduce_words R=" << R
|
||||||
<< " base=" << base << " pack=" << t_pack << " us" << std::endl;
|
<< " base=" << base
|
||||||
|
<< " kernel=" << t_kernel << " D2H=" << t_d2h << " us" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
auto sum_bundle = sumD_gpu_small(buf_p, osites);
|
|
||||||
for (int k = 0; k < R; k++)
|
for (int k = 0; k < R; k++)
|
||||||
ret_p[base + k] = TensorRemove(sum_bundle._internal[k]);
|
ret_p[base + k] = TensorRemove(result._internal[k]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
|
|||||||
Reference in New Issue
Block a user