sumD_gpu_direct: revert to per-lane write; CUB handles Nsimd*osites inputs

Benchmarking showed the shared-memory lane-summation approach (843d6497) was slower than writing each SIMD lane individually and letting CUB reduce the full nlanes = osites*Nsimd array. CUB's device reduce is more efficient over the larger input than the smem overhead + serialised lane-0 summation. The smem approach also required overriding acceleratorThreads() to avoid the block-size sizing problem. Restore the simpler per-lane path. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-08 11:23:30 +01:00 · 2026-05-18 21:23:15 -04:00
parent d358954a84
commit a0f00c0eca
1 changed files with 14 additions and 26 deletions
@@ -63,36 +63,24 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
  typedef typename vobj::scalar_object  sobj;
  typedef typename vobj::scalar_objectD sobjD;

-  constexpr Integer nsimd = vobj::Nsimd();
+  const Integer nsimd  = vobj::Nsimd();
+  const Integer nlanes = osites * nsimd;

-  deviceVector<sobjD> per_site(osites);
-  sobjD *per_site_p = &per_site[0];
-
-  // Set acceleratorThreads=1 so the block is dim3(nsimd,1,1): one site per block.
-  // smem[nsimd] is then exactly sized; no other site group competes for the same slots.
-  // Restore after the (synchronous) accelerator_for.
-  uint32_t saved_threads = acceleratorThreads();
-  acceleratorThreads(1);
+  deviceVector<sobjD> per_lane(nlanes);
+  sobjD *per_lane_p = &per_lane[0];

 #ifdef GRID_REDUCTION_TIMING
  RealD t_for = -usecond();
 #endif
-  accelerator_for(ss, osites, nsimd, {
-    int lane = acceleratorSIMTlane(nsimd);
-    __shared__ sobjD smem[nsimd];
-    sobj tmp  = extractLane(lane, lat[ss]);
+  accelerator_for(idx, nlanes, 1, {
+    Integer ss   = idx / nsimd;
+    Integer lane = idx % nsimd;
+    sobj tmp = extractLane(lane, lat[ss]);
    sobjD tmpD; tmpD = tmp;
-    smem[lane] = tmpD;
-    acceleratorSynchronise();
-    if (lane == 0) {
-      sobjD acc = smem[0];
-      for (int l = 1; l < nsimd; l++) acc = acc + smem[l];
-      per_site_p[ss] = acc;
-    }
+    per_lane_p[idx] = tmpD;
  });
-  // accelerator_for already issued accelerator_barrier(); restore thread count now.
-  acceleratorThreads(saved_threads);
 #ifdef GRID_REDUCTION_TIMING
+  accelerator_barrier();
  t_for += usecond();
 #endif

@@ -102,8 +90,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
  size_t temp_bytes = 0;

  gpuError_t gpuErr;
-  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
-                                        (int)osites, gpucub::Sum(), zero, computeStream);
+  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
+                                        (int)nlanes, gpucub::Sum(), zero, computeStream);
  if (gpuErr != gpuSuccess) {
    std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: "
              << gpuErr << std::endl;
@@ -115,8 +103,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
 #ifdef GRID_REDUCTION_TIMING
  RealD t_cub = -usecond();
 #endif
-  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
-                                        (int)osites, gpucub::Sum(), zero, computeStream);
+  gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
+                                        (int)nlanes, gpucub::Sum(), zero, computeStream);
  if (gpuErr != gpuSuccess) {
    std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: "
              << gpuErr << std::endl;