1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-06-24 20:43:29 +01:00

Lattice_reduction_sycl: fix double-precision accumulation in sumD_gpu_tensor

Accumulate in sobjD throughout rather than accumulating in sobj and
converting the final sum. For float fields this matters: summing N floats
then casting loses O(N*eps_float) relative precision vs accumulating in
double from the start.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Peter Boyle
2026-05-18 21:53:40 -04:00
parent a1119266c1
commit bf2140e74d
+13 -14
View File
@@ -8,26 +8,25 @@ NAMESPACE_BEGIN(Grid);
template <class vobj> template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD; typedef typename vobj::scalar_objectD sobjD;
sobj identity; zeroit(identity); sobjD identity; zeroit(identity);
sobj ret; zeroit(ret); sobjD ret; zeroit(ret);
Integer nsimd= vobj::Nsimd();
{ {
sycl::buffer<sobj, 1> abuff(&ret, {1}); sycl::buffer<sobjD, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>()); auto Reduction = sycl::reduction(abuff, cgh, identity, std::plus<>());
cgh.parallel_for(sycl::range<1>{osites}, cgh.parallel_for(sycl::range<1>{(size_t)osites},
Reduction, Reduction,
[=] (sycl::id<1> item, auto &sum) { [=](sycl::id<1> item, auto &sum) {
auto osite = item[0]; sobj s = Reduce(lat[item[0]]);
sum +=Reduce(lat[osite]); sobjD sd; sd = s;
}); sum += sd;
});
}); });
} }
sobjD dret; convertType(dret,ret); return ret;
return dret;
} }
template <class vobj> template <class vobj>