Add Test_reduction to tests/debug

Tests the new CUB/hipCUB/SYCL lattice reduction (sum_gpu) against the preserved hand-rolled implementation (sum_gpu_old) for LatticeComplexF/D, LatticeColourMatrixF/D and LatticePropagatorF/D. Part a) gaussian random field: checks that old and new agree to within float/double roundoff tolerance. Part b) constant field (= 1.0, identity-matrix init): verifies innerProduct(sum, sum) = Ncomp * V^2 where Ncomp counts the nonzero diagonal scalar components per site (1 / Nc / Ns*Nc respectively). Make.inc is auto-generated by scripts/filelist on bootstrap and is not tracked; the new .cc file is all that is needed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-23 18:44:17 +01:00 · 2026-05-15 14:31:33 -04:00
parent 969b0a3922
commit 286c29d6fb
1 changed files with 160 additions and 0 deletions
@@ -0,0 +1,160 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./tests/debug/Test_reduction.cc
+
+    Copyright (C) 2024
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+static int passed = 0;
+static int failed = 0;
+
+static void check(bool ok, const std::string &msg)
+{
+  if (ok) {
+    std::cout << GridLogMessage << "PASS  " << msg << std::endl;
+    passed++;
+  } else {
+    std::cout << GridLogMessage << "FAIL  " << msg << std::endl;
+    failed++;
+  }
+}
+
+// Frobenius-like norm squared of any Grid scalar tensor object.
+// For iScalar chains:  real(conj(a)*a)
+// For iMatrix<T,N>:    sum_{i,j} real(conj(a_ij)*a_ij)  (all elements)
+// Both old and new paths promote to double before accumulating, so the result
+// is always exact enough for the tolerances used here.
+template<class T>
+RealD scalarNorm2(const T &a)
+{
+  return (RealD)real(TensorRemove(innerProduct(a, a)));
+}
+
+template<class Field>
+void testReduction(GridCartesian *grid, GridParallelRNG &rng,
+                   const std::string &name, int Ncomp)
+{
+  typedef typename Field::vector_object vobj;
+  typedef typename vobj::scalar_object  sobj;
+  typedef typename vobj::scalar_type    scalar_type;
+
+  const Integer V      = grid->_gsites;
+  const Integer osites = grid->oSites();
+
+  // Detect single vs double precision by comparing fundamental scalar sizes.
+  const bool isFloat = (sizeof(scalar_type) < sizeof(ComplexD));
+
+  std::cout << GridLogMessage << "=== " << name << " ===" << std::endl;
+
+  Field field(grid);
+
+  //--------------------------------------------------------------------
+  // a) Gaussian random field: sum_gpu (new CUB path) vs sum_gpu_old
+  //    (preserved hand-rolled shared-memory path).  Both promote lanes
+  //    to double internally, so results should agree to near-roundoff.
+  //--------------------------------------------------------------------
+#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+  {
+    gaussian(rng, field);
+
+    autoView(v, field, AcceleratorRead);
+    sobj new_result = sum_gpu    (&v[0], osites);
+    sobj old_result = sum_gpu_old(&v[0], osites);
+
+    sobj  diff  = new_result - old_result;
+    RealD diffn = scalarNorm2(diff);
+    RealD refn  = scalarNorm2(old_result);
+    RealD reldiff = (refn > 0.0) ? std::sqrt(diffn / refn) : std::sqrt(diffn);
+
+    // Float fields: both paths cast from double to float, expect O(eps_float).
+    // Double fields: ordering differences at most O(V * eps_double).
+    RealD tol = isFloat ? 1e-6 : 1e-10;
+
+    std::cout << GridLogMessage
+              << name << " random reldiff = " << reldiff << std::endl;
+    check(reldiff < tol, name + " random: sum_gpu agrees with sum_gpu_old");
+  }
+#endif
+
+  //--------------------------------------------------------------------
+  // b) Constant field via field = 1.0.
+  //
+  //    Grid's iMatrix::operator=(scalar) sets only the diagonal, so:
+  //      LatticeComplex       -> scalar 1.0        (Ncomp = 1  nonzero per site)
+  //      LatticeColourMatrix  -> Nc x Nc identity  (Ncomp = Nc nonzero per site)
+  //      LatticePropagator    -> (Ns*Nc)^2 identity (Ncomp = Ns*Nc nonzero per site)
+  //
+  //    After GlobalSum: sum_result has Ncomp diagonal entries each equal to V,
+  //    all off-diagonal entries zero.  Grid's recursive innerProduct computes
+  //    the Frobenius inner product (sum of |element|^2 over all indices), giving
+  //
+  //        innerProduct(sum_result, sum_result) = Ncomp * V^2
+  //--------------------------------------------------------------------
+  {
+    field = 1.0;
+    sobj sum_result = sum(field);   // uses new GPU path + GlobalSum
+
+    RealD got      = scalarNorm2(sum_result);
+    RealD expected = (RealD)Ncomp * (RealD)V * (RealD)V;
+    RealD reldiff  = std::abs(got - expected) / expected;
+
+    std::cout << GridLogMessage
+              << name << " const: got " << got
+              << "  expected " << expected
+              << "  reldiff "  << reldiff << std::endl;
+    check(reldiff < 1e-8, name + " const: innerProduct(sum,sum) = Ncomp*V^2");
+  }
+}
+
+int main(int argc, char **argv)
+{
+  Grid_init(&argc, &argv);
+
+  Coordinate latt = GridDefaultLatt();
+  Coordinate simd = GridDefaultSimd(Nd, vComplexD::Nsimd());
+  Coordinate mpi  = GridDefaultMpi();
+
+  GridCartesian *grid = SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi);
+  GridParallelRNG rng(grid);
+  rng.SeedFixedIntegers({1, 2, 3, 4});
+
+  std::cout << GridLogMessage << "Lattice : " << latt << std::endl;
+  std::cout << GridLogMessage << "Volume  : " << grid->_gsites << std::endl;
+
+  testReduction<LatticeComplexF>      (grid, rng, "LatticeComplexF",      1      );
+  testReduction<LatticeComplexD>      (grid, rng, "LatticeComplexD",      1      );
+  testReduction<LatticeColourMatrixF> (grid, rng, "LatticeColourMatrixF", Nc     );
+  testReduction<LatticeColourMatrixD> (grid, rng, "LatticeColourMatrixD", Nc     );
+  testReduction<LatticePropagatorF>   (grid, rng, "LatticePropagatorF",   Ns*Nc  );
+  testReduction<LatticePropagatorD>   (grid, rng, "LatticePropagatorD",   Ns*Nc  );
+
+  std::cout << GridLogMessage << "==============================" << std::endl;
+  std::cout << GridLogMessage << passed << " PASSED   " << failed << " FAILED" << std::endl;
+
+  Grid_finalize();
+  return (failed > 0) ? EXIT_FAILURE : EXIT_SUCCESS;
+}