mirror of
https://github.com/paboyle/Grid.git
synced 2026-05-25 19:44:17 +01:00
Revert to hand-rolled reduction; drop Lattice_reduction_gpu_cub.h
Remove the CUB/hipCUB direction entirely. Restore Lattice_reduction_gpu.h,
Lattice_reduction_sycl.h, and Lattice_reduction.h to the state before the
CUB rewrite (commit 969b0a39), recovering the original primary function names
(sumD_gpu_small, sumD_gpu_large, sumD_gpu, sum_gpu, sum_gpu_large) and the
hand-rolled shared-memory reduction kernel.
Delete Lattice_reduction_gpu_cub.h. Update Test_reduction to remove the
old/new comparison sections that depended on sum_gpu_old.
The lesson: CUB DeviceReduce is slower than the hand-rolled kernel for small
types, and the smem sizing problem for the extraction pass has no clean
solution within the accelerator_for abstraction. The right improvement is
a higher radix (12 then 4) in sumD_gpu_large, applied directly to the
existing hand-rolled kernel.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -31,9 +31,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
|||||||
#if defined(GRID_SYCL)
|
#if defined(GRID_SYCL)
|
||||||
#include <Grid/lattice/Lattice_reduction_sycl.h>
|
#include <Grid/lattice/Lattice_reduction_sycl.h>
|
||||||
#endif
|
#endif
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
|
||||||
#include <Grid/lattice/Lattice_reduction_gpu_cub.h>
|
|
||||||
#endif
|
|
||||||
#include <Grid/lattice/Lattice_slicesum_core.h>
|
#include <Grid/lattice/Lattice_slicesum_core.h>
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|||||||
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
|
|||||||
// Possibly promote to double and sum
|
// Possibly promote to double and sum
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_small_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_objectD sobj;
|
typedef typename vobj::scalar_objectD sobj;
|
||||||
typedef decltype(lat) Iterator;
|
typedef decltype(lat) Iterator;
|
||||||
@@ -224,7 +224,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small_old(const vobj *lat, Integer
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_large_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::vector_type vector;
|
typedef typename vobj::vector_type vector;
|
||||||
typedef typename vobj::scalar_typeD scalarD;
|
typedef typename vobj::scalar_typeD scalarD;
|
||||||
@@ -244,13 +244,13 @@ inline typename vobj::scalar_objectD sumD_gpu_large_old(const vobj *lat, Integer
|
|||||||
buf[ss] = dat[ss*words+w];
|
buf[ss] = dat[ss*words+w];
|
||||||
});
|
});
|
||||||
|
|
||||||
ret_p[w] = sumD_gpu_small_old(tbuf,osites);
|
ret_p[w] = sumD_gpu_small(tbuf,osites);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_objectD sobj;
|
typedef typename vobj::scalar_objectD sobj;
|
||||||
sobj ret;
|
sobj ret;
|
||||||
@@ -261,9 +261,9 @@ inline typename vobj::scalar_objectD sumD_gpu_old(const vobj *lat, Integer osite
|
|||||||
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
|
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
|
||||||
|
|
||||||
if ( ok ) {
|
if ( ok ) {
|
||||||
ret = sumD_gpu_small_old(lat,osites);
|
ret = sumD_gpu_small(lat,osites);
|
||||||
} else {
|
} else {
|
||||||
ret = sumD_gpu_large_old(lat,osites);
|
ret = sumD_gpu_large(lat,osites);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@@ -272,20 +272,20 @@ inline typename vobj::scalar_objectD sumD_gpu_old(const vobj *lat, Integer osite
|
|||||||
// Return as same precision as input performing reduction in double precision though
|
// Return as same precision as input performing reduction in double precision though
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_object sum_gpu_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
sobj result;
|
sobj result;
|
||||||
result = sumD_gpu_old(lat,osites);
|
result = sumD_gpu(lat,osites);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_object sum_gpu_large_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
sobj result;
|
sobj result;
|
||||||
result = sumD_gpu_large_old(lat,osites);
|
result = sumD_gpu_large(lat,osites);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,361 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
Source file: ./Grid/lattice/Lattice_reduction_gpu_cub.h
|
|
||||||
Copyright (C) 2015-2024
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#if defined(GRID_CUDA)
|
|
||||||
#include <cub/cub.cuh>
|
|
||||||
#define gpucub cub
|
|
||||||
#define gpuError_t cudaError_t
|
|
||||||
#define gpuSuccess cudaSuccess
|
|
||||||
#elif defined(GRID_HIP)
|
|
||||||
#include <hipcub/hipcub.hpp>
|
|
||||||
#define gpucub hipcub
|
|
||||||
#define gpuError_t hipError_t
|
|
||||||
#define gpuSuccess hipSuccess
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Unified lattice reduction using CUB (CUDA/HIP) and sycl::reduction (SYCL).
|
|
||||||
//
|
|
||||||
// CUDA/HIP: one accelerator_for pass per site to extract SIMD lanes and promote to sobjD,
|
|
||||||
// then CUB/hipCUB DeviceReduce::Reduce over the resulting array.
|
|
||||||
//
|
|
||||||
// rocPRIM's DeviceReduce requires warpSize(64) threads per block, each holding one element
|
|
||||||
// in shared memory: sizeof(T)*64 must fit in sharedMemPerBlock. Large QCD objects such as
|
|
||||||
// LatticePropagator (sobjD = 2304 bytes, 64*2304 = 147 KB) exceed this budget.
|
|
||||||
//
|
|
||||||
// For those types sumD_gpu_large groups the vobj's vector_type words in bundles of 4,
|
|
||||||
// reducing each bundle as an iVector<iScalar<scalarD>,4> (64 bytes, 64*64 = 4 KB — always safe).
|
|
||||||
// Words that do not fill a complete bundle are zero-padded.
|
|
||||||
//
|
|
||||||
// SYCL: sycl::reduction handles any type size through the runtime, so one path suffices.
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
|
||||||
|
|
||||||
#define GRID_REDUCTION_TIMING
|
|
||||||
|
|
||||||
// Direct CUB reduction on the full scalar_objectD.
|
|
||||||
// Only safe when sizeof(sobjD)*64 <= device sharedMemPerBlock.
|
|
||||||
// Do not call directly for large composite types (e.g. LatticePropagator).
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_direct(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
|
||||||
|
|
||||||
const Integer nsimd = vobj::Nsimd();
|
|
||||||
const Integer nlanes = osites * nsimd;
|
|
||||||
|
|
||||||
deviceVector<sobjD> per_lane(nlanes);
|
|
||||||
sobjD *per_lane_p = &per_lane[0];
|
|
||||||
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
RealD t_for = -usecond();
|
|
||||||
#endif
|
|
||||||
accelerator_for(idx, nlanes, 1, {
|
|
||||||
Integer ss = idx / nsimd;
|
|
||||||
Integer lane = idx % nsimd;
|
|
||||||
sobj tmp = extractLane(lane, lat[ss]);
|
|
||||||
sobjD tmpD; tmpD = tmp;
|
|
||||||
per_lane_p[idx] = tmpD;
|
|
||||||
});
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
accelerator_barrier();
|
|
||||||
t_for += usecond();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
sobjD zero; zeroit(zero);
|
|
||||||
sobjD *d_out = static_cast<sobjD *>(acceleratorAllocDevice(sizeof(sobjD)));
|
|
||||||
void *d_temp = nullptr;
|
|
||||||
size_t temp_bytes = 0;
|
|
||||||
|
|
||||||
gpuError_t gpuErr;
|
|
||||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
|
|
||||||
(int)nlanes, gpucub::Sum(), zero, computeStream);
|
|
||||||
if (gpuErr != gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce size query failed: "
|
|
||||||
<< gpuErr << std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
d_temp = acceleratorAllocDevice(temp_bytes);
|
|
||||||
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
RealD t_cub = -usecond();
|
|
||||||
#endif
|
|
||||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, per_lane_p, d_out,
|
|
||||||
(int)nlanes, gpucub::Sum(), zero, computeStream);
|
|
||||||
if (gpuErr != gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "sumD_gpu_direct: DeviceReduce failed: "
|
|
||||||
<< gpuErr << std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
accelerator_barrier();
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
t_cub += usecond();
|
|
||||||
std::cout << GridLogMessage << "sumD_gpu_direct"
|
|
||||||
<< " sizeof(sobjD)=" << sizeof(sobjD)
|
|
||||||
<< " accelerator_for=" << t_for << " us"
|
|
||||||
<< " CUB_reduce=" << t_cub << " us" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
sobjD result;
|
|
||||||
acceleratorCopyFromDevice(d_out, &result, sizeof(sobjD));
|
|
||||||
acceleratorFreeDevice(d_temp);
|
|
||||||
acceleratorFreeDevice(d_out);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Radix-4 word-bundle path for types too large for the direct CUB path.
|
|
||||||
// Treats vobj as words of vector_type; groups them in bundles of 4 and reduces
|
|
||||||
// each bundle as an iVector<iScalar<scalarD>,4> — reusing Grid's existing tensor
|
|
||||||
// type which already has accelerator_inline operator+ and zeroit().
|
|
||||||
// sizeof = 4 * sizeof(scalarD) <= 64 bytes; 64 * 64 = 4096 bytes, safely within
|
|
||||||
// rocPRIM's shared-memory budget on all supported devices.
|
|
||||||
// If words % 4 != 0, the final partial bundle is zero-padded so all unused
|
|
||||||
// slots contribute zero to the sum.
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::vector_type vector;
|
|
||||||
typedef typename vobj::scalar_typeD scalarD;
|
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
|
||||||
using R4 = iVector<iScalar<scalarD>, 4>;
|
|
||||||
|
|
||||||
const int words = sizeof(vobj) / sizeof(vector);
|
|
||||||
const int nfull = words / 4;
|
|
||||||
const int rem = words % 4;
|
|
||||||
|
|
||||||
sobjD ret; zeroit(ret);
|
|
||||||
scalarD *ret_p = (scalarD *)&ret;
|
|
||||||
|
|
||||||
iScalar<vector> *idat = (iScalar<vector> *)lat;
|
|
||||||
deviceVector<R4> buf(osites);
|
|
||||||
R4 *buf_p = &buf[0];
|
|
||||||
|
|
||||||
R4 zero4; zeroit(zero4);
|
|
||||||
|
|
||||||
R4 *d_out = static_cast<R4 *>(acceleratorAllocDevice(sizeof(R4)));
|
|
||||||
void *d_temp = nullptr;
|
|
||||||
size_t temp_bytes = 0;
|
|
||||||
|
|
||||||
// Probe workspace size once — type R4 and count osites are fixed across all groups.
|
|
||||||
gpuError_t gpuErr;
|
|
||||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, buf_p, d_out,
|
|
||||||
(int)osites, gpucub::Sum(), zero4, computeStream);
|
|
||||||
if (gpuErr != gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "sumD_gpu_large: DeviceReduce size query failed: "
|
|
||||||
<< gpuErr << std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
d_temp = acceleratorAllocDevice(temp_bytes);
|
|
||||||
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
RealD t_for_large = 0.0, t_cub_large = 0.0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Full groups of 4 words.
|
|
||||||
for (int g = 0; g < nfull; g++) {
|
|
||||||
int base = 4 * g;
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
t_for_large -= usecond();
|
|
||||||
#endif
|
|
||||||
accelerator_for(ss, osites, 1, {
|
|
||||||
R4 r4;
|
|
||||||
r4._internal[0] = TensorRemove(Reduce(idat[ss * words + base ]));
|
|
||||||
r4._internal[1] = TensorRemove(Reduce(idat[ss * words + base + 1]));
|
|
||||||
r4._internal[2] = TensorRemove(Reduce(idat[ss * words + base + 2]));
|
|
||||||
r4._internal[3] = TensorRemove(Reduce(idat[ss * words + base + 3]));
|
|
||||||
buf_p[ss] = r4;
|
|
||||||
});
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
accelerator_barrier();
|
|
||||||
t_for_large += usecond();
|
|
||||||
t_cub_large -= usecond();
|
|
||||||
#endif
|
|
||||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, buf_p, d_out,
|
|
||||||
(int)osites, gpucub::Sum(), zero4, computeStream);
|
|
||||||
if (gpuErr != gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "sumD_gpu_large: DeviceReduce failed (group "
|
|
||||||
<< g << "): " << gpuErr << std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
accelerator_barrier();
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
t_cub_large += usecond();
|
|
||||||
#endif
|
|
||||||
R4 group_result;
|
|
||||||
acceleratorCopyFromDevice(d_out, &group_result, sizeof(R4));
|
|
||||||
ret_p[base ] = TensorRemove(group_result._internal[0]);
|
|
||||||
ret_p[base + 1] = TensorRemove(group_result._internal[1]);
|
|
||||||
ret_p[base + 2] = TensorRemove(group_result._internal[2]);
|
|
||||||
ret_p[base + 3] = TensorRemove(group_result._internal[3]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Partial last group: zero-pad unused slots so they contribute nothing to the sum.
|
|
||||||
if (rem > 0) {
|
|
||||||
int base = 4 * nfull;
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
t_for_large -= usecond();
|
|
||||||
#endif
|
|
||||||
accelerator_for(ss, osites, 1, {
|
|
||||||
R4 r4; zeroit(r4);
|
|
||||||
for (int k = 0; k < rem; k++)
|
|
||||||
r4._internal[k] = TensorRemove(Reduce(idat[ss * words + base + k]));
|
|
||||||
buf_p[ss] = r4;
|
|
||||||
});
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
accelerator_barrier();
|
|
||||||
t_for_large += usecond();
|
|
||||||
t_cub_large -= usecond();
|
|
||||||
#endif
|
|
||||||
gpuErr = gpucub::DeviceReduce::Reduce(d_temp, temp_bytes, buf_p, d_out,
|
|
||||||
(int)osites, gpucub::Sum(), zero4, computeStream);
|
|
||||||
if (gpuErr != gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "sumD_gpu_large: DeviceReduce failed (partial group): "
|
|
||||||
<< gpuErr << std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
accelerator_barrier();
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
t_cub_large += usecond();
|
|
||||||
#endif
|
|
||||||
R4 partial_result;
|
|
||||||
acceleratorCopyFromDevice(d_out, &partial_result, sizeof(R4));
|
|
||||||
for (int k = 0; k < rem; k++)
|
|
||||||
ret_p[4 * nfull + k] = TensorRemove(partial_result._internal[k]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef GRID_REDUCTION_TIMING
|
|
||||||
std::cout << GridLogMessage << "sumD_gpu_large"
|
|
||||||
<< " sizeof(sobjD)=" << sizeof(sobjD)
|
|
||||||
<< " words=" << words << " nfull=" << nfull << " rem=" << rem
|
|
||||||
<< " accelerator_for=" << t_for_large << " us"
|
|
||||||
<< " CUB_reduce=" << t_cub_large << " us" << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
acceleratorFreeDevice(d_temp);
|
|
||||||
acceleratorFreeDevice(d_out);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Dispatch: direct CUB path for types that fit in the shared-memory budget,
|
|
||||||
// radix-4 word-bundle path for larger types.
|
|
||||||
// Threshold 512 bytes: 64 * 512 = 32768 bytes, within rocPRIM's
|
|
||||||
// ROCPRIM_SHARED_MEMORY_MAX on all supported devices.
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
|
||||||
if constexpr (sizeof(sobjD) > 512) {
|
|
||||||
return sumD_gpu_large(lat, osites);
|
|
||||||
} else {
|
|
||||||
return sumD_gpu_direct(lat, osites);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
return sumD_gpu(lat, osites);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
|
||||||
sobj result;
|
|
||||||
result = sumD_gpu(lat, osites);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
|
||||||
sobj result;
|
|
||||||
result = sumD_gpu_large(lat, osites);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // GRID_CUDA || GRID_HIP
|
|
||||||
|
|
||||||
#if defined(GRID_SYCL)
|
|
||||||
|
|
||||||
// Accumulates in sobjD throughout, fixing the precision bug in the original
|
|
||||||
// Lattice_reduction_sycl.h which accumulated in sobj then converted at the end.
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
|
||||||
|
|
||||||
sobjD identity; zeroit(identity);
|
|
||||||
sobjD ret; zeroit(ret);
|
|
||||||
{
|
|
||||||
sycl::buffer<sobjD, 1> abuff(&ret, {1});
|
|
||||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
|
||||||
auto Reduction = sycl::reduction(abuff, cgh, identity, std::plus<>());
|
|
||||||
cgh.parallel_for(sycl::range<1>{(size_t)osites},
|
|
||||||
Reduction,
|
|
||||||
[=](sycl::id<1> item, auto &sum) {
|
|
||||||
sobj s = Reduce(lat[item[0]]);
|
|
||||||
sobjD sd; sd = s;
|
|
||||||
sum += sd;
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
return sumD_gpu(lat, osites);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
return sumD_gpu(lat, osites);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
|
||||||
sobj result;
|
|
||||||
result = sumD_gpu(lat, osites);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
return sum_gpu(lat, osites);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // GRID_SYCL
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
@@ -6,7 +6,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_tensor_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
@@ -31,40 +31,40 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor_old(const vobj *lat, Intege
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_large_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
return sumD_gpu_tensor_old(lat,osites);
|
return sumD_gpu_tensor(lat,osites);
|
||||||
}
|
}
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_small_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
return sumD_gpu_large_old(lat,osites);
|
return sumD_gpu_large(lat,osites);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
return sumD_gpu_large_old(lat,osites);
|
return sumD_gpu_large(lat,osites);
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Return as same precision as input performing reduction in double precision though
|
// Return as same precision as input performing reduction in double precision though
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_object sum_gpu_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
sobj result;
|
sobj result;
|
||||||
result = sumD_gpu_old(lat,osites);
|
result = sumD_gpu(lat,osites);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_object sum_gpu_large_old(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
sobj result;
|
sobj result;
|
||||||
result = sumD_gpu_large_old(lat,osites);
|
result = sumD_gpu_large(lat,osites);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -73,36 +73,7 @@ void testReduction(GridCartesian *grid, GridParallelRNG &rng,
|
|||||||
Field field(grid);
|
Field field(grid);
|
||||||
|
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
// a) Gaussian random field: sum_gpu (new CUB path) vs sum_gpu_old
|
// a) Timing: Niter timed calls reporting us/call and GB/s.
|
||||||
// (preserved hand-rolled shared-memory path). Both promote lanes
|
|
||||||
// to double internally, so results should agree to near-roundoff.
|
|
||||||
//--------------------------------------------------------------------
|
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
|
||||||
{
|
|
||||||
gaussian(rng, field);
|
|
||||||
|
|
||||||
autoView(v, field, AcceleratorRead);
|
|
||||||
sobj new_result = sum_gpu (&v[0], osites);
|
|
||||||
sobj old_result = sum_gpu_old(&v[0], osites);
|
|
||||||
|
|
||||||
sobj diff = new_result - old_result;
|
|
||||||
RealD diffn = squaredSum(diff);
|
|
||||||
RealD refn = squaredSum(old_result);
|
|
||||||
RealD reldiff = (refn > 0.0) ? std::sqrt(diffn / refn) : std::sqrt(diffn);
|
|
||||||
|
|
||||||
// Float fields: both paths cast from double to float, expect O(eps_float).
|
|
||||||
// Double fields: ordering differences at most O(V * eps_double).
|
|
||||||
RealD tol = isFloat ? 1e-6 : 1e-10;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< name << " random reldiff = " << reldiff << std::endl;
|
|
||||||
check(reldiff < tol, name + " random: sum_gpu agrees with sum_gpu_old");
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//--------------------------------------------------------------------
|
|
||||||
// b) Timing: new (CUB/sycl::reduction) vs old (hand-rolled) path.
|
|
||||||
// Warmup first, then Niter timed calls; report us/call and GB/s.
|
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||||
{
|
{
|
||||||
@@ -113,38 +84,28 @@ void testReduction(GridCartesian *grid, GridParallelRNG &rng,
|
|||||||
|
|
||||||
{
|
{
|
||||||
autoView(v, field, AcceleratorRead);
|
autoView(v, field, AcceleratorRead);
|
||||||
for (int i = 0; i < Nwarm; i++) sum_gpu (&v[0], osites);
|
for (int i = 0; i < Nwarm; i++) sum_gpu(&v[0], osites);
|
||||||
for (int i = 0; i < Nwarm; i++) sum_gpu_old(&v[0], osites);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
RealD t_new, t_old;
|
RealD t_new;
|
||||||
{
|
{
|
||||||
autoView(v, field, AcceleratorRead);
|
autoView(v, field, AcceleratorRead);
|
||||||
t_new = -usecond();
|
t_new = -usecond();
|
||||||
for (int i = 0; i < Niter; i++) sum_gpu(&v[0], osites);
|
for (int i = 0; i < Niter; i++) sum_gpu(&v[0], osites);
|
||||||
t_new += usecond();
|
t_new += usecond();
|
||||||
}
|
}
|
||||||
{
|
|
||||||
autoView(v, field, AcceleratorRead);
|
|
||||||
t_old = -usecond();
|
|
||||||
for (int i = 0; i < Niter; i++) sum_gpu_old(&v[0], osites);
|
|
||||||
t_old += usecond();
|
|
||||||
}
|
|
||||||
|
|
||||||
RealD bytes = (RealD)osites * sizeof(vobj);
|
RealD bytes = (RealD)osites * sizeof(vobj);
|
||||||
RealD GBs_new = bytes / (t_new / Niter) * 1e-3;
|
RealD GBs = bytes / (t_new / Niter) * 1e-3;
|
||||||
RealD GBs_old = bytes / (t_old / Niter) * 1e-3;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << name << " timing (" << Niter << " calls):" << std::endl;
|
std::cout << GridLogMessage << name << " timing (" << Niter << " calls):" << std::endl;
|
||||||
std::cout << GridLogMessage
|
std::cout << GridLogMessage
|
||||||
<< " sum_gpu " << t_new/Niter << " us " << GBs_new << " GB/s" << std::endl;
|
<< " sum_gpu " << t_new/Niter << " us " << GBs << " GB/s" << std::endl;
|
||||||
std::cout << GridLogMessage
|
|
||||||
<< " sum_gpu_old " << t_old/Niter << " us " << GBs_old << " GB/s" << std::endl;
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
// d) Constant field via field = 1.0.
|
// b) Constant field via field = 1.0.
|
||||||
//
|
//
|
||||||
// Grid's iMatrix::operator=(scalar) sets only the diagonal, so:
|
// Grid's iMatrix::operator=(scalar) sets only the diagonal, so:
|
||||||
// LatticeComplex -> scalar 1.0 (Ncomp = 1 nonzero per site)
|
// LatticeComplex -> scalar 1.0 (Ncomp = 1 nonzero per site)
|
||||||
|
|||||||
Reference in New Issue
Block a user