From ab2de131bd28cbc651149381f0fd45678e796555 Mon Sep 17 00:00:00 2001 From: dbollweg Date: Tue, 6 Feb 2024 13:24:45 -0500 Subject: [PATCH] work towards sliceSum for sycl backend --- Grid/lattice/Lattice_reduction.h | 15 ++++ Grid/lattice/Lattice_slicesum_gpu.h | 9 --- Grid/lattice/Lattice_slicesum_sycl.h | 115 +++++++++++++++++++++++++++ Grid/threads/Accelerator.h | 4 +- tests/core/Test_sliceSum.cc | 4 +- 5 files changed, 134 insertions(+), 13 deletions(-) create mode 100644 Grid/lattice/Lattice_slicesum_sycl.h diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index f85ed7e3..bfd41b6c 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -31,6 +31,7 @@ Author: Christoph Lehner #endif #if defined(GRID_SYCL) #include +#include #endif NAMESPACE_BEGIN(Grid); @@ -505,6 +506,20 @@ sliceSum(const Lattice &Data,int orthogdim) return result; } +template inline +std::vector +sliceSumGpu(const Lattice &Data,int orthogdim) +{ + std::vector result; + #if defined(GRID_CUDA) || defined(GRID_HIP) + sliceSumGpu(Data,result,orthogdim); + #elif defined(GRID_SYCL) + sliceSum_sycl(Data,result,orthogdim); + #endif + return result; +} + + template static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) { diff --git a/Grid/lattice/Lattice_slicesum_gpu.h b/Grid/lattice/Lattice_slicesum_gpu.h index 8e13808f..5d2ad049 100644 --- a/Grid/lattice/Lattice_slicesum_gpu.h +++ b/Grid/lattice/Lattice_slicesum_gpu.h @@ -177,13 +177,4 @@ template inline void sliceSumGpu(const Lattice &Data,std::vect grid->GlobalSumVector(ptr, words); } -template inline -std::vector -sliceSumGpu(const Lattice &Data,int orthogdim) -{ - std::vector result; - sliceSumGpu(Data,result,orthogdim); - return result; -} - NAMESPACE_END(Grid); \ No newline at end of file diff --git a/Grid/lattice/Lattice_slicesum_sycl.h b/Grid/lattice/Lattice_slicesum_sycl.h new file mode 100644 index 00000000..034e9dd3 --- /dev/null +++ b/Grid/lattice/Lattice_slicesum_sycl.h @@ -0,0 +1,115 @@ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +inline void sliceSum_sycl(const Lattice &Data, std::vector &result, int orthogdim) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object::scalar_type scalar_type; + + GridBase *grid = Data.Grid(); + assert(grid!=NULL); + + const int Nd = grid->_ndimension; + const size_t Nsimd = grid->Nsimd(); + + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + int ostride=grid->_ostride[orthogdim]; + size_t subvol_size = e1*e2; + + vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); + vobj vobj_zero; + zeroit(vobj_zero); + + + result.resize(fd); + + Vector lvSum(rd); + Vector lsSum(ld,Zero()); + commVector reduction_buffer(rd*subvol_size); + ExtractBuffer extracted(Nsimd); + + for(int r=0;rsubmit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(mysum,vobj_zero,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{subvol_size}, + Reduction, + [=](cl::sycl::id<1> item, auto &sum) { + auto s = item[0]; + sum += rb_p[r*subvol_size+s]; + }); + }); + theGridAccelerator->wait(); + lvSum[r] = mysum[0]; + } + + Coordinate icoor(Nd); + + for(int rt=0;rtiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]; + + } + } + + // sum over nodes. + for(int t=0;t_processor_coor[orthogdim] ) { + result[t]=lsSum[lt]; + } else { + result[t]=Zero(); + } + + } + scalar_type * ptr = (scalar_type *) &result[0]; + int words = fd*sizeof(sobj)/sizeof(scalar_type); + grid->GlobalSumVector(ptr, words); + +} + +NAMESPACE_END(Grid); \ No newline at end of file diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index ff5ccd7a..eaafea5d 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -256,12 +256,12 @@ NAMESPACE_END(Grid); #if 0 #include #include -#include +#include #include #else #include #include -#include +#include #include #endif diff --git a/tests/core/Test_sliceSum.cc b/tests/core/Test_sliceSum.cc index 7dd69cdc..399ab899 100644 --- a/tests/core/Test_sliceSum.cc +++ b/tests/core/Test_sliceSum.cc @@ -26,7 +26,7 @@ int main (int argc, char ** argv) { //warmup for (int sweeps = 0; sweeps < 5; sweeps++) { - sliceSumGpu(test_data,reduction_result,0); + reduction_result = sliceSumGpu(test_data,0); } int trace_id = traceStart("sliceSum benchmark"); @@ -46,7 +46,7 @@ int main (int argc, char ** argv) { RealD tgpu=-usecond(); tracePush("sliceSumGpu"); - sliceSumGpu(test_data,reduction_result,i); + reduction_result = sliceSumGpu(test_data,i); tracePop("sliceSumGpu"); tgpu+=usecond();