1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-05-26 03:54:17 +01:00

Test_reduction: add timing benchmark for new vs old reduction paths

Reports us/call and GB/s for sum_gpu (CUB/sycl::reduction) and
sum_gpu_old (hand-rolled shared-memory reduction) for each field type,
with a 5-call warmup followed by a 100-call timed loop.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Peter Boyle
2026-05-18 12:31:13 -04:00
parent c93b338bdd
commit baa70d8ec9
+44 -1
View File
@@ -101,7 +101,50 @@ void testReduction(GridCartesian *grid, GridParallelRNG &rng,
#endif
//--------------------------------------------------------------------
// b) Constant field via field = 1.0.
// b) Timing: new (CUB/sycl::reduction) vs old (hand-rolled) path.
// Warmup first, then Niter timed calls; report us/call and GB/s.
//--------------------------------------------------------------------
// The timing comparison is only compiled for accelerator builds (CUDA, HIP or SYCL).
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
{
// Each path gets Nwarm untimed warmup calls, then Niter timed calls.
const int Nwarm = 5;
const int Niter = 100;
// Presumably fills field with Gaussian random values drawn from rng -- TODO confirm.
gaussian(rng, field);
{
// Warmup: run both reduction paths on the same read view; the returned sums are discarded.
autoView(v, field, AcceleratorRead);
for (int i = 0; i < Nwarm; i++) sum_gpu (&v[0], osites);
for (int i = 0; i < Nwarm; i++) sum_gpu_old(&v[0], osites);
}
// Total elapsed time of the Niter-call loop for each path.
// The "us" label printed below implies usecond() reports microseconds.
RealD t_new, t_old;
{
// Time the new path. The view is created before the timer starts, so only
// the Niter calls fall inside the measured interval.
// NOTE(review): assumes sum_gpu returns only after the reduction has finished,
// otherwise the interval would under-count -- confirm.
autoView(v, field, AcceleratorRead);
t_new = -usecond();
for (int i = 0; i < Niter; i++) sum_gpu(&v[0], osites);
t_new += usecond();
}
{
// Time the old path in exactly the same way, on a freshly created view.
autoView(v, field, AcceleratorRead);
t_old = -usecond();
for (int i = 0; i < Niter; i++) sum_gpu_old(&v[0], osites);
t_old += usecond();
}
// Data volume attributed to one call: osites elements of sizeof(vobj) bytes,
// matching the (pointer, osites) arguments passed to both reductions.
// NOTE(review): assumes each call reads the whole array exactly once -- confirm.
RealD bytes = (RealD)osites * sizeof(vobj);
// With times in microseconds, bytes / (us per call) is bytes per microsecond
// (10^6 bytes/s); the 1e-3 factor converts that to GB/s (10^9 bytes/s).
RealD GBs_new = bytes / (t_new / Niter) * 1e-3;
RealD GBs_old = bytes / (t_old / Niter) * 1e-3;
// Report the mean time per call and the derived bandwidth for each path.
std::cout << GridLogMessage << name << " timing (" << Niter << " calls):" << std::endl;
std::cout << GridLogMessage
<< " sum_gpu " << t_new/Niter << " us " << GBs_new << " GB/s" << std::endl;
std::cout << GridLogMessage
<< " sum_gpu_old " << t_old/Niter << " us " << GBs_old << " GB/s" << std::endl;
}
#endif
//--------------------------------------------------------------------
// d) Constant field via field = 1.0.
//
// Grid's iMatrix::operator=(scalar) sets only the diagonal, so:
// LatticeComplex -> scalar 1.0 (Ncomp = 1 nonzero per site)