sumD_gpu_reduce_words: fuse pack+reduce into single packReduceKernel

Replace the two-kernel pack+reduce sequence with a single fused kernel packReduceKernel<R> that reads R words of each vobj at offset 'base' and accumulates directly into iVector<iScalar<scalarD>,R>, eliminating the intermediate bundle buffer entirely. HBM access per word-group drops from 3x (pack-read + pack-write + reduce-read) to 1x. Thread count comes from getNumBlocksAndThreads (warpSize..256) rather than acceleratorThreads(), so occupancy is correct regardless of the --accelerator-threads setting. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-17 15:43:27 +01:00 · 2026-05-21 12:34:30 -04:00
parent d6b1388741
commit a424775884
1 changed files with 119 additions and 24 deletions
@@ -239,41 +239,136 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  return result;
 }
-// Pack R consecutive vector_type words of lat[0..osites-1] starting at word
+// Fused pack+reduce: reads R words of each vobj at word offset 'base',
-// 'base' into a Bundle = iVector<iScalar<vector>,R> per site, then reduce
+// accumulates directly into iVector<iScalar<scalarD>,R> without staging
-// with sumD_gpu_small.  Bundle::Nsimd() == vector::Nsimd(), so the existing
+// through an intermediate bundle buffer.  One HBM pass instead of three.
-// shared-memory kernel handles SIMD-lane extraction and double-promotion
+template <int R, class vobj, class sobj, class Iterator>
-// correctly.  sizeof(Bundle::scalar_objectD) = R*sizeof(scalarD) <= 192 B
+__device__ void packReduceBlocks(
-// for R<=12, safely within sharedMemPerBlock on all supported devices.
+    const iScalar<typename vobj::vector_type> *idat,
    sobj *g_odata, Iterator osites, int base, int words)
 {
  constexpr Iterator nsimd = vobj::Nsimd();
  Iterator blockSize = blockDim.x;
  extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
  sobj *sdata = (sobj *)shmem_pointer;
  Iterator tid      = threadIdx.x;
  Iterator i        = blockIdx.x * (blockSize * 2) + threadIdx.x;
  Iterator gridSize = blockSize * 2 * gridDim.x;
  sobj mySum = Zero();
  while (i < osites * nsimd) {
    Iterator lane = i % nsimd;
    Iterator ss   = i / nsimd;
    sobj tmpD; zeroit(tmpD);
    for (int k = 0; k < R; k++) {
      auto w = extractLane(lane, idat[ss * words + base + k]);
      iScalar<typename vobj::scalar_typeD> wd; wd = w;
      tmpD._internal[k] = wd;
    }
    mySum += tmpD;
    if (i + blockSize < osites * nsimd) {
      lane = (i + blockSize) % nsimd;
      ss   = (i + blockSize) / nsimd;
      sobj tmpD2; zeroit(tmpD2);
      for (int k = 0; k < R; k++) {
        auto w = extractLane(lane, idat[ss * words + base + k]);
        iScalar<typename vobj::scalar_typeD> wd; wd = w;
        tmpD2._internal[k] = wd;
      }
      mySum += tmpD2;
    }
    i += gridSize;
  }
  reduceBlock(sdata, mySum, tid);
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 }
 template <int R, class vobj, class sobj, class Iterator>
 __global__ void packReduceKernel(
    const iScalar<typename vobj::vector_type> *idat,
    sobj *buffer, Iterator osites, int base, int words)
 {
  Iterator blockSize = blockDim.x;
  packReduceBlocks<R, vobj, sobj>(idat, buffer, osites, base, words);
  if (gridDim.x > 1) {
    const Iterator tid = threadIdx.x;
    __shared__ bool amLast;
    extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
    sobj *smem = (sobj *)shmem_pointer;
    acceleratorFence();
    if (tid == 0) {
      unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
      amLast = (ticket == gridDim.x - 1);
    }
    acceleratorSynchroniseAll();
    if (amLast) {
      Iterator i = tid;
      sobj mySum = Zero();
      while (i < (Iterator)gridDim.x) {
        mySum += buffer[i];
        i += blockSize;
      }
      reduceBlock(smem, mySum, tid);
      if (tid == 0) {
        buffer[0] = smem[0];
        retirementCount = 0;
      }
    }
  }
 }
 template<int R, class vobj>
 inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites,
                                   typename vobj::scalar_typeD *ret_p, int base)
 {
-  typedef typename vobj::vector_type vector;
+  typedef typename vobj::vector_type  vector;
-  using Bundle = iVector<iScalar<vector>, R>;
+  typedef typename vobj::scalar_typeD scalarD;
  using BundleScalarD = iVector<iScalar<scalarD>, R>;
  const int words = sizeof(vobj) / sizeof(vector);
  iScalar<vector> *idat = (iScalar<vector> *)lat;
  deviceVector<Bundle> buf(osites);
  Bundle *buf_p = &buf[0];
 #ifdef GRID_REDUCTION_TIMING
  RealD t_pack = -usecond();
 #endif
  constexpr int Nsimd = vobj::Nsimd();
-  accelerator_for2d(k, R, ss, osites, Nsimd, {
+  const int words = sizeof(vobj) / sizeof(vector);
-    coalescedWrite(buf_p[ss]._internal[k], coalescedRead(idat[ss * words + base + k]));
+  const iScalar<vector> *idat = (const iScalar<vector> *)lat;
-  });
+
  Integer size = (Integer)osites * Nsimd;
  Integer numThreads, numBlocks;
  int ok = getNumBlocksAndThreads(size, sizeof(BundleScalarD), numThreads, numBlocks);
  GRID_ASSERT(ok);
  Integer smemSize = numThreads * sizeof(BundleScalarD);
  deviceVector<BundleScalarD> buffer(numBlocks);
  BundleScalarD *buffer_v = &buffer[0];
  BundleScalarD result;
 #ifdef GRID_REDUCTION_TIMING
-  t_pack += usecond();
+  RealD t_kernel = -usecond();
 #endif
  packReduceKernel<R, vobj, BundleScalarD, Integer>
    <<<numBlocks, numThreads, smemSize, computeStream>>>
    (idat, buffer_v, osites, base, words);
  accelerator_barrier();
 #ifdef GRID_REDUCTION_TIMING
  t_kernel += usecond();
  RealD t_d2h = -usecond();
 #endif
  acceleratorCopyFromDevice(buffer_v, &result, sizeof(result));
 #ifdef GRID_REDUCTION_TIMING
  t_d2h += usecond();
  std::cout << GridLogMessage << " sumD_gpu_reduce_words R=" << R
-            << " base=" << base << " pack=" << t_pack << " us" << std::endl;
+            << " base=" << base
            << " kernel=" << t_kernel << " D2H=" << t_d2h << " us" << std::endl;
 #endif
  auto sum_bundle = sumD_gpu_small(buf_p, osites);
  for (int k = 0; k < R; k++)
-    ret_p[base + k] = TensorRemove(sum_bundle._internal[k]);
+    ret_p[base + k] = TensorRemove(result._internal[k]);
 }
 template <class vobj>