diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 9c4cc051..187f2fb3 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -1,5 +1,5 @@ #pragma once -#include + #if defined(GRID_CUDA) #include @@ -90,8 +90,61 @@ template inline void sliceSumReduction_cub_small(const vobj *Data, V } +#endif -template inline void sliceSumReduction_cub_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { + +#if defined(GRID_SYCL) +template inline void sliceSumReduction_sycl_small(const vobj *Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +{ + size_t subvol_size = e1*e2; + + vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); + vobj vobj_zero; + zeroit(vobj_zero); + for (int r = 0; r reduction_buffer(rd*subvol_size); + + auto rb_p = &reduction_buffer[0]; + + // autoView(Data_v, Data, AcceleratorRead); + + //prepare reduction buffer + accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ + + int n = s / e2; + int b = s % e2; + int so=r*ostride; // base offset for start of plane + int ss= so+n*stride+b; + + coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); + + }); + + for (int r = 0; r < rd; r++) { + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{subvol_size}, + Reduction, + [=](cl::sycl::id<1> item, auto &sum) { + auto s = item[0]; + sum += rb_p[r*subvol_size+s]; + }); + }); + + + } + theGridAccelerator->wait(); + for (int r = 0; r < rd; r++) { + lvSum[r] = mysum[r]; + } + free(mysum,*theGridAccelerator); +} +#endif + +template inline void sliceSumReduction_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { typedef typename vobj::vector_type vector; const int words = sizeof(vobj)/sizeof(vector); const int osites = rd*e1*e2; @@ -106,8 +159,12 @@ template inline void sliceSumReduction_cub_large(const vobj *Data, V buf[ss] = dat[ss*words+w]; }); - sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); - + #if defined(GRID_CUDA) || defined(GRID_HIP) + sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); + #elif defined(GRID_SYCL) + sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); + #endif + for (int r = 0; r < rd; r++) { lvSum_ptr[w+words*r]=lvSum_small[r]; } @@ -117,66 +174,24 @@ template inline void sliceSumReduction_cub_large(const vobj *Data, V } -template inline void sliceSumReduction_cub(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +template inline void sliceSumReduction_gpu(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { - autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. + autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. if constexpr (sizeof(vobj) <= 256) { - sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #if defined(GRID_CUDA) || defined(GRID_HIP) + sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + #elif defined (GRID_SYCL) + sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + #endif + } else { - sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); } } -#endif -#if defined(GRID_SYCL) -template inline void sliceSumReduction_sycl(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) -{ - typedef typename vobj::scalar_object sobj; - size_t subvol_size = e1*e2; - - vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); - vobj vobj_zero; - zeroit(vobj_zero); - - commVector reduction_buffer(rd*subvol_size); - - auto rb_p = &reduction_buffer[0]; - - autoView(Data_v, Data, AcceleratorRead); - - //prepare reduction buffer - accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ - - int n = s / e2; - int b = s % e2; - int so=r*ostride; // base offset for start of plane - int ss= so+n*stride+b; - - coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); - - }); - - for (int r = 0; r < rd; r++) { - mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); - cgh.parallel_for(cl::sycl::range<1>{subvol_size}, - Reduction, - [=](cl::sycl::id<1> item, auto &sum) { - auto s = item[0]; - sum += rb_p[r*subvol_size+s]; - }); - }); - theGridAccelerator->wait(); - lvSum[r] = mysum[0]; - } - - free(mysum,*theGridAccelerator); -} -#endif - template inline void sliceSumReduction_cpu(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) { // sum over reduced dimension planes, breaking out orthog dir @@ -195,13 +210,9 @@ template inline void sliceSumReduction_cpu(const Lattice &Data template inline void sliceSumReduction(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) { - #if defined(GRID_CUDA) || defined(GRID_HIP) + #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) - sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #elif defined(GRID_SYCL) - - sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); + sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); #else sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);