From bf2140e74d09ab75eba96eef8209d3f9db03f1fa Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 18 May 2026 21:53:40 -0400 Subject: [PATCH] Lattice_reduction_sycl: fix double-precision accumulation in sumD_gpu_tensor Accumulate in sobjD throughout rather than accumulating in sobj and converting the final sum. For float fields this matters: summing N floats then casting loses O(N*eps_float) relative precision vs accumulating in double from the start. Co-Authored-By: Claude Sonnet 4.6 --- Grid/lattice/Lattice_reduction_sycl.h | 31 +++++++++++++-------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index ace7f010..84e7a09a 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -6,28 +6,27 @@ NAMESPACE_BEGIN(Grid); template -inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) +inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) { - typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_objectD sobjD; - sobj identity; zeroit(identity); - sobj ret; zeroit(ret); - Integer nsimd= vobj::Nsimd(); - { - sycl::buffer abuff(&ret, {1}); + sobjD identity; zeroit(identity); + sobjD ret; zeroit(ret); + { + sycl::buffer abuff(&ret, {1}); theGridAccelerator->submit([&](sycl::handler &cgh) { - auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>()); - cgh.parallel_for(sycl::range<1>{osites}, - Reduction, - [=] (sycl::id<1> item, auto &sum) { - auto osite = item[0]; - sum +=Reduce(lat[osite]); - }); + auto Reduction = sycl::reduction(abuff, cgh, identity, std::plus<>()); + cgh.parallel_for(sycl::range<1>{(size_t)osites}, + Reduction, + [=](sycl::id<1> item, auto &sum) { + sobj s = Reduce(lat[item[0]]); + sobjD sd; sd = s; + sum += sd; + }); }); } - sobjD dret; convertType(dret,ret); - return dret; + return ret; } template