mirror of
https://github.com/paboyle/Grid.git
synced 2026-06-18 01:43:43 +01:00
sumD_gpu_direct: revert to per-lane write; CUB handles Nsimd*osites inputs
Benchmarking showed the shared-memory lane-summation approach (843d6497)
was slower than writing each SIMD lane individually and letting CUB reduce
the full nlanes = osites*Nsimd array. CUB's device reduce is more efficient
over the larger input than the smem overhead + serialised lane-0 summation.
The smem approach also required overriding acceleratorThreads() to avoid
the block-size sizing problem. Restore the simpler per-lane path.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -63,36 +63,24 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::scalar_objectD sobjD;
|
||||
|
||||
constexpr Integer nsimd = vobj::Nsimd();
|
||||
const Integer nsimd = vobj::Nsimd();
|
||||
const Integer nlanes = osites * nsimd;
|
||||
|
||||
deviceVector<sobjD> per_site(osites);
|
||||
sobjD *per_site_p = &per_site[0];
|
||||
|
||||
// Set acceleratorThreads=1 so the block is dim3(nsimd,1,1): one site per block.
|
||||
// smem[nsimd] is then exactly sized; no other site group competes for the same slots.
|
||||
// Restore after the (synchronous) accelerator_for.
|
||||
uint32_t saved_threads = acceleratorThreads();
|
||||
acceleratorThreads(1);
|
||||
deviceVector<sobjD> per_lane(nlanes);
|
||||
sobjD *per_lane_p = &per_lane[0];
|
||||
|
||||
#ifdef GRID_REDUCTION_TIMING
|
||||
RealD t_for = -usecond();
|
||||
#endif
|
||||
accelerator_for(ss, osites, nsimd, {
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
__shared__ sobjD smem[nsimd];
|
||||
sobj tmp = extractLane(lane, lat[ss]);
|
||||
accelerator_for(idx, nlanes, 1, {
|
||||
Integer ss = idx / nsimd;
|
||||
Integer lane = idx % nsimd;
|
||||
sobj tmp = extractLane(lane, lat[ss]);
|
||||
sobjD tmpD; tmpD = tmp;
|
||||
smem[lane] = tmpD;
|
||||
acceleratorSynchronise();
|
||||
if (lane == 0) {
|
||||
sobjD acc = smem[0];
|
||||
for (int l = 1; l < nsimd; l++) acc = acc + smem[l];
|
||||
per_site_p[ss] = acc;
|
||||
}
|
||||
per_lane_p[idx] = tmpD;
|
||||
});
|
||||
// accelerator_for already issued accelerator_barrier(); restore thread count now.
|
||||
acceleratorThreads(saved_threads);
|
||||
#ifdef GRID_REDUCTION_TIMING
|
||||
accelerator_barrier();
|
||||
t_for += usecond();
|
||||
#endif
|
||||
|
||||
@@ -102,8 +90,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
|
||||
size_t temp_bytes = 0;
|
||||
|
||||
gpuError_t gpuErr;
|
||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
|
||||
(int)osites, gpucub::Sum(), zero, computeStream);
|
||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
|
||||
(int)nlanes, gpucub::Sum(), zero, computeStream);
|
||||
if (gpuErr != gpuSuccess) {
|
||||
std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: "
|
||||
<< gpuErr << std::endl;
|
||||
@@ -115,8 +103,8 @@ inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer os
|
||||
#ifdef GRID_REDUCTION_TIMING
|
||||
RealD t_cub = -usecond();
|
||||
#endif
|
||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_site_p, d_out,
|
||||
(int)osites, gpucub::Sum(), zero, computeStream);
|
||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
|
||||
(int)nlanes, gpucub::Sum(), zero, computeStream);
|
||||
if (gpuErr != gpuSuccess) {
|
||||
std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: "
|
||||
<< gpuErr << std::endl;
|
||||
|
||||
Reference in New Issue
Block a user