From be94cf1c6ffb293aa06c7f6fb1a4cf023a073249 Mon Sep 17 00:00:00 2001 From: dbollweg Date: Wed, 6 Mar 2024 16:53:13 -0500 Subject: [PATCH 1/3] Fewer wait-calls in sycl slicesum --- Grid/lattice/Lattice_slicesum_core.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 9c4cc051..63737517 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -136,10 +136,13 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat typedef typename vobj::scalar_object sobj; size_t subvol_size = e1*e2; - vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); + vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); vobj vobj_zero; zeroit(vobj_zero); - + for (int r = 0; r reduction_buffer(rd*subvol_size); auto rb_p = &reduction_buffer[0]; @@ -159,9 +162,8 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat }); for (int r = 0; r < rd; r++) { - mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); + auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); cgh.parallel_for(cl::sycl::range<1>{subvol_size}, Reduction, [=](cl::sycl::id<1> item, auto &sum) { @@ -169,10 +171,13 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat sum += rb_p[r*subvol_size+s]; }); }); - theGridAccelerator->wait(); - lvSum[r] = mysum[0]; + + + } + theGridAccelerator->wait(); + for (int r = 0; r < rd; r++) { + lvSum[r] = mysum[r]; } - free(mysum,*theGridAccelerator); } #endif From 31f9971dbf97c1f1389c5ee08d649871f22df514 Mon Sep 17 00:00:00 2001 From: dbollweg Date: Wed, 13 Mar 2024 13:39:26 -0400 Subject: [PATCH 2/3] avoid PI_ERROR_OUT_OF_RESOURCES in sycl sliceSum --- Grid/lattice/Lattice_slicesum_core.h | 46 +++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 63737517..36580f57 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -131,9 +131,8 @@ template inline void sliceSumReduction_cub(const Lattice &Data #if defined(GRID_SYCL) -template inline void sliceSumReduction_sycl(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template inline void sliceSumReduction_sycl_small(const vobj *Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) { - typedef typename vobj::scalar_object sobj; size_t subvol_size = e1*e2; vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); @@ -147,7 +146,7 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat auto rb_p = &reduction_buffer[0]; - autoView(Data_v, Data, AcceleratorRead); + // autoView(Data_v, Data, AcceleratorRead); //prepare reduction buffer accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ @@ -157,7 +156,7 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat int so=r*ostride; // base offset for start of plane int ss= so+n*stride+b; - coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); + coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); }); @@ -180,6 +179,45 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat } free(mysum,*theGridAccelerator); } + + +template inline void sliceSumReduction_sycl_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { + typedef typename vobj::vector_type vector; + const int words = sizeof(vobj)/sizeof(vector); + const int osites = rd*e1*e2; + commVectorbuffer(osites); + vector *dat = (vector *)Data; + vector *buf = &buffer[0]; + Vector lvSum_small(rd); + vector *lvSum_ptr = (vector *)&lvSum[0]; + + for (int w = 0; w < words; w++) { + accelerator_for(ss,osites,1,{ + buf[ss] = dat[ss*words+w]; + }); + + sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); + + for (int r = 0; r < rd; r++) { + lvSum_ptr[w+words*r]=lvSum_small[r]; + } + + } +} + + +template inline void sliceSumReduction_sycl(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +{ + autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. + if constexpr (sizeof(vobj) <= 256) { + sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + } + else { + sliceSumReduction_sycl_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + } +} + + #endif template inline void sliceSumReduction_cpu(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) From 461cd045c647ddd2a1fcd7c4caf6cce431992500 Mon Sep 17 00:00:00 2001 From: dbollweg Date: Wed, 13 Mar 2024 18:18:44 -0400 Subject: [PATCH 3/3] sliceSum cleanup --- Grid/lattice/Lattice_slicesum_core.h | 80 +++++++++------------------- 1 file changed, 24 insertions(+), 56 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 36580f57..187f2fb3 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -1,5 +1,5 @@ #pragma once -#include + #if defined(GRID_CUDA) #include @@ -90,44 +90,7 @@ template inline void sliceSumReduction_cub_small(const vobj *Data, V } - -template inline void sliceSumReduction_cub_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { - typedef typename vobj::vector_type vector; - const int words = sizeof(vobj)/sizeof(vector); - const int osites = rd*e1*e2; - commVectorbuffer(osites); - vector *dat = (vector *)Data; - vector *buf = &buffer[0]; - Vector lvSum_small(rd); - vector *lvSum_ptr = (vector *)&lvSum[0]; - - for (int w = 0; w < words; w++) { - accelerator_for(ss,osites,1,{ - buf[ss] = dat[ss*words+w]; - }); - - sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); - - for (int r = 0; r < rd; r++) { - lvSum_ptr[w+words*r]=lvSum_small[r]; - } - - } - - -} - -template inline void sliceSumReduction_cub(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) -{ - autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. - if constexpr (sizeof(vobj) <= 256) { - sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); - } - else { - sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); - } -} -#endif +#endif #if defined(GRID_SYCL) @@ -179,9 +142,9 @@ template inline void sliceSumReduction_sycl_small(const vobj *Data, } free(mysum,*theGridAccelerator); } +#endif - -template inline void sliceSumReduction_sycl_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { +template inline void sliceSumReduction_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { typedef typename vobj::vector_type vector; const int words = sizeof(vobj)/sizeof(vector); const int osites = rd*e1*e2; @@ -196,29 +159,38 @@ template inline void sliceSumReduction_sycl_large(const vobj *Data, buf[ss] = dat[ss*words+w]; }); - sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); - + #if defined(GRID_CUDA) || defined(GRID_HIP) + sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); + #elif defined(GRID_SYCL) + sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); + #endif + for (int r = 0; r < rd; r++) { lvSum_ptr[w+words*r]=lvSum_small[r]; } } + + } - -template inline void sliceSumReduction_sycl(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +template inline void sliceSumReduction_gpu(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { - autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. + autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. if constexpr (sizeof(vobj) <= 256) { - sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #if defined(GRID_CUDA) || defined(GRID_HIP) + sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + #elif defined (GRID_SYCL) + sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + #endif + } else { - sliceSumReduction_sycl_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); } } - -#endif template inline void sliceSumReduction_cpu(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) { @@ -238,13 +210,9 @@ template inline void sliceSumReduction_cpu(const Lattice &Data template inline void sliceSumReduction(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) { - #if defined(GRID_CUDA) || defined(GRID_HIP) + #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) - sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #elif defined(GRID_SYCL) - - sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); + sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); #else sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);