1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-05-24 02:54:16 +01:00

Revert to hand-rolled reduction; drop Lattice_reduction_gpu_cub.h

Remove the CUB/hipCUB direction entirely. Restore Lattice_reduction_gpu.h,
Lattice_reduction_sycl.h, and Lattice_reduction.h to the state before the
CUB rewrite (commit 969b0a39), recovering the original primary function names
(sumD_gpu_small, sumD_gpu_large, sumD_gpu, sum_gpu, sum_gpu_large) and the
hand-rolled shared-memory reduction kernel.

Delete Lattice_reduction_gpu_cub.h. Update Test_reduction to remove the
old/new comparison sections that depended on sum_gpu_old.

The lesson: CUB DeviceReduce is slower than the hand-rolled kernel for small
types, and the smem sizing problem for the extraction pass has no clean
solution within the accelerator_for abstraction. The right improvement is
a higher radix (12 then 4) in sumD_gpu_large, applied directly to the
existing hand-rolled kernel.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Peter Boyle
2026-05-18 21:52:18 -04:00
parent f4fbf7c9ca
commit 068f95ad2d
5 changed files with 28 additions and 431 deletions
+7 -46
View File
@@ -73,36 +73,7 @@ void testReduction(GridCartesian *grid, GridParallelRNG &rng,
Field field(grid);
//--------------------------------------------------------------------
// a) Gaussian random field: sum_gpu (new CUB path) vs sum_gpu_old
// (preserved hand-rolled shared-memory path). Both promote lanes
// to double internally, so results should agree to near-roundoff.
//--------------------------------------------------------------------
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
{
gaussian(rng, field);
autoView(v, field, AcceleratorRead);
sobj new_result = sum_gpu (&v[0], osites);
sobj old_result = sum_gpu_old(&v[0], osites);
sobj diff = new_result - old_result;
RealD diffn = squaredSum(diff);
RealD refn = squaredSum(old_result);
RealD reldiff = (refn > 0.0) ? std::sqrt(diffn / refn) : std::sqrt(diffn);
// Float fields: both paths cast from double to float, expect O(eps_float).
// Double fields: ordering differences at most O(V * eps_double).
RealD tol = isFloat ? 1e-6 : 1e-10;
std::cout << GridLogMessage
<< name << " random reldiff = " << reldiff << std::endl;
check(reldiff < tol, name + " random: sum_gpu agrees with sum_gpu_old");
}
#endif
//--------------------------------------------------------------------
// b) Timing: new (CUB/sycl::reduction) vs old (hand-rolled) path.
// Warmup first, then Niter timed calls; report us/call and GB/s.
// a) Timing: Niter timed calls reporting us/call and GB/s.
//--------------------------------------------------------------------
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
{
@@ -113,38 +84,28 @@ void testReduction(GridCartesian *grid, GridParallelRNG &rng,
{
autoView(v, field, AcceleratorRead);
for (int i = 0; i < Nwarm; i++) sum_gpu (&v[0], osites);
for (int i = 0; i < Nwarm; i++) sum_gpu_old(&v[0], osites);
for (int i = 0; i < Nwarm; i++) sum_gpu(&v[0], osites);
}
RealD t_new, t_old;
RealD t_new;
{
autoView(v, field, AcceleratorRead);
t_new = -usecond();
for (int i = 0; i < Niter; i++) sum_gpu(&v[0], osites);
t_new += usecond();
}
{
autoView(v, field, AcceleratorRead);
t_old = -usecond();
for (int i = 0; i < Niter; i++) sum_gpu_old(&v[0], osites);
t_old += usecond();
}
RealD bytes = (RealD)osites * sizeof(vobj);
RealD GBs_new = bytes / (t_new / Niter) * 1e-3;
RealD GBs_old = bytes / (t_old / Niter) * 1e-3;
RealD bytes = (RealD)osites * sizeof(vobj);
RealD GBs = bytes / (t_new / Niter) * 1e-3;
std::cout << GridLogMessage << name << " timing (" << Niter << " calls):" << std::endl;
std::cout << GridLogMessage
<< " sum_gpu " << t_new/Niter << " us " << GBs_new << " GB/s" << std::endl;
std::cout << GridLogMessage
<< " sum_gpu_old " << t_old/Niter << " us " << GBs_old << " GB/s" << std::endl;
<< " sum_gpu " << t_new/Niter << " us " << GBs << " GB/s" << std::endl;
}
#endif
//--------------------------------------------------------------------
// d) Constant field via field = 1.0.
// b) Constant field via field = 1.0.
//
// Grid's iMatrix::operator=(scalar) sets only the diagonal, so:
// LatticeComplex -> scalar 1.0 (Ncomp = 1 nonzero per site)