1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-06-19 18:33:43 +01:00
Files
Grid/tests/debug/Test_reduction.cc
T
Peter Boyle f3c3b1c04b Test_reduction: add timing benchmark for new vs old reduction paths
Reports us/call and GB/s for sum_gpu (CUB/sycl::reduction) and
sum_gpu_old (hand-rolled shared-memory) for each field type, with
5-call warmup and 100-call timed loop.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00

207 lines
7.8 KiB
C++

/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/debug/Test_reduction.cc
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
static int passed = 0;
static int failed = 0;
static void check(bool ok, const std::string &msg)
{
if (ok) {
std::cout << GridLogMessage << "PASS " << msg << std::endl;
passed++;
} else {
std::cout << GridLogMessage << "FAIL " << msg << std::endl;
failed++;
}
}
// Squared magnitude of a Grid scalar tensor aggregate: innerProduct(a,a).
// For iScalar: real(conj(a)*a)
// For iMatrix<T,N>: sum_{i,j} real(conj(a_ij)*a_ij) (Frobenius)
// Named squaredSum to make clear the squaring is applied to the aggregate
// (the sum), not to individual site values before summing.
template<class T>
RealD squaredSum(const T &a)
{
return (RealD)real(TensorRemove(innerProduct(a, a)));
}
template<class Field>
void testReduction(GridCartesian *grid, GridParallelRNG &rng,
const std::string &name, int Ncomp)
{
typedef typename Field::vector_object vobj;
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
const Integer V = grid->_gsites;
const Integer osites = grid->oSites();
// Detect single vs double precision by comparing fundamental scalar sizes.
const bool isFloat = (sizeof(scalar_type) < sizeof(ComplexD));
std::cout << GridLogMessage << "=== " << name << " ===" << std::endl;
Field field(grid);
//--------------------------------------------------------------------
// a) Gaussian random field: sum_gpu (new CUB path) vs sum_gpu_old
// (preserved hand-rolled shared-memory path). Both promote lanes
// to double internally, so results should agree to near-roundoff.
//--------------------------------------------------------------------
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
{
gaussian(rng, field);
autoView(v, field, AcceleratorRead);
sobj new_result = sum_gpu (&v[0], osites);
sobj old_result = sum_gpu_old(&v[0], osites);
sobj diff = new_result - old_result;
RealD diffn = squaredSum(diff);
RealD refn = squaredSum(old_result);
RealD reldiff = (refn > 0.0) ? std::sqrt(diffn / refn) : std::sqrt(diffn);
// Float fields: both paths cast from double to float, expect O(eps_float).
// Double fields: ordering differences at most O(V * eps_double).
RealD tol = isFloat ? 1e-6 : 1e-10;
std::cout << GridLogMessage
<< name << " random reldiff = " << reldiff << std::endl;
check(reldiff < tol, name + " random: sum_gpu agrees with sum_gpu_old");
}
#endif
//--------------------------------------------------------------------
// b) Timing: new (CUB/sycl::reduction) vs old (hand-rolled) path.
// Warmup first, then Niter timed calls; report us/call and GB/s.
//--------------------------------------------------------------------
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
{
const int Nwarm = 5;
const int Niter = 100;
gaussian(rng, field);
{
autoView(v, field, AcceleratorRead);
for (int i = 0; i < Nwarm; i++) sum_gpu (&v[0], osites);
for (int i = 0; i < Nwarm; i++) sum_gpu_old(&v[0], osites);
}
RealD t_new, t_old;
{
autoView(v, field, AcceleratorRead);
t_new = -usecond();
for (int i = 0; i < Niter; i++) sum_gpu(&v[0], osites);
t_new += usecond();
}
{
autoView(v, field, AcceleratorRead);
t_old = -usecond();
for (int i = 0; i < Niter; i++) sum_gpu_old(&v[0], osites);
t_old += usecond();
}
RealD bytes = (RealD)osites * sizeof(vobj);
RealD GBs_new = bytes / (t_new / Niter) * 1e-3;
RealD GBs_old = bytes / (t_old / Niter) * 1e-3;
std::cout << GridLogMessage << name << " timing (" << Niter << " calls):" << std::endl;
std::cout << GridLogMessage
<< " sum_gpu " << t_new/Niter << " us " << GBs_new << " GB/s" << std::endl;
std::cout << GridLogMessage
<< " sum_gpu_old " << t_old/Niter << " us " << GBs_old << " GB/s" << std::endl;
}
#endif
//--------------------------------------------------------------------
// d) Constant field via field = 1.0.
//
// Grid's iMatrix::operator=(scalar) sets only the diagonal, so:
// LatticeComplex -> scalar 1.0 (Ncomp = 1 nonzero per site)
// LatticeColourMatrix -> Nc x Nc identity (Ncomp = Nc nonzero per site)
// LatticePropagator -> (Ns*Nc)^2 identity (Ncomp = Ns*Nc nonzero per site)
//
// After GlobalSum: sum_result has Ncomp diagonal entries each equal to V,
// all off-diagonal entries zero. Grid's recursive innerProduct computes
// the Frobenius inner product (sum of |element|^2 over all indices), giving
//
// innerProduct(sum_result, sum_result) = Ncomp * V^2
//--------------------------------------------------------------------
{
field = 1.0;
sobj sum_result = sum(field); // uses new GPU path + GlobalSum
RealD got = squaredSum(sum_result);
RealD expected = (RealD)Ncomp * (RealD)V * (RealD)V;
RealD reldiff = std::abs(got - expected) / expected;
std::cout << GridLogMessage
<< name << " const: got " << got
<< " expected " << expected
<< " reldiff " << reldiff << std::endl;
check(reldiff < 1e-8, name + " const: innerProduct(sum,sum) = Ncomp*V^2");
}
}
int main(int argc, char **argv)
{
Grid_init(&argc, &argv);
Coordinate latt = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt, GridDefaultSimd(Nd, vComplexD::Nsimd()), mpi);
GridCartesian *UGrid_f = SpaceTimeGrid::makeFourDimGrid(latt, GridDefaultSimd(Nd, vComplexF::Nsimd()), mpi);
GridParallelRNG rng(UGrid);
rng.SeedFixedIntegers({1, 2, 3, 4});
GridParallelRNG rng_f(UGrid_f);
rng_f.SeedFixedIntegers({1, 2, 3, 4});
std::cout << GridLogMessage << "Lattice : " << latt << std::endl;
std::cout << GridLogMessage << "Volume : " << UGrid->_gsites << std::endl;
testReduction<LatticeComplexF> (UGrid_f, rng_f, "LatticeComplexF", 1 );
testReduction<LatticeComplexD> (UGrid, rng, "LatticeComplexD", 1 );
testReduction<LatticeColourMatrixF> (UGrid_f, rng_f, "LatticeColourMatrixF", Nc );
testReduction<LatticeColourMatrixD> (UGrid, rng, "LatticeColourMatrixD", Nc );
testReduction<LatticePropagatorF> (UGrid_f, rng_f, "LatticePropagatorF", Ns*Nc );
testReduction<LatticePropagatorD> (UGrid, rng, "LatticePropagatorD", Ns*Nc );
std::cout << GridLogMessage << "==============================" << std::endl;
std::cout << GridLogMessage << passed << " PASSED " << failed << " FAILED" << std::endl;
Grid_finalize();
return (failed > 0) ? EXIT_FAILURE : EXIT_SUCCESS;
}