diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 9c4cc051..63737517 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -136,10 +136,13 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat typedef typename vobj::scalar_object sobj; size_t subvol_size = e1*e2; - vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); + vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); vobj vobj_zero; zeroit(vobj_zero); - + for (int r = 0; r reduction_buffer(rd*subvol_size); auto rb_p = &reduction_buffer[0]; @@ -159,9 +162,8 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat }); for (int r = 0; r < rd; r++) { - mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); + auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); cgh.parallel_for(cl::sycl::range<1>{subvol_size}, Reduction, [=](cl::sycl::id<1> item, auto &sum) { @@ -169,10 +171,13 @@ template inline void sliceSumReduction_sycl(const Lattice &Dat sum += rb_p[r*subvol_size+s]; }); }); - theGridAccelerator->wait(); - lvSum[r] = mysum[0]; + + + } + theGridAccelerator->wait(); + for (int r = 0; r < rd; r++) { + lvSum[r] = mysum[r]; } - free(mysum,*theGridAccelerator); } #endif