diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h
index 8395eb7c..d1a4186f 100644
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -9,25 +9,34 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
 {
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::scalar_objectD sobjD;
-  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
+  //  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
+  //  sobj *mysum =(sobj *) malloc(sizeof(sobj));
+
   sobj identity; zeroit(identity);
-  sobj ret ;
+  sobj ret; zeroit(ret); // the reduction folds the buffer's initial value into the result, so it must start at zero
 
   Integer nsimd= vobj::Nsimd();
-
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
-     cgh.parallel_for(cl::sycl::range<1>{osites},
-                      Reduction,
-                      [=] (cl::sycl::id<1> item, auto &sum) {
-                        auto osite   = item[0];
-                        sum +=Reduce(lat[osite]);
-                      });
-   });
+
+  {
+    sycl::buffer<sobj, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+
+      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
+      cgh.parallel_for(cl::sycl::range<1>{osites},
+                       Reduction,
+                       [=] (cl::sycl::id<1> item, auto &sum) {
+                         auto osite   = item[0];
+                         sum +=Reduce(lat[osite]);
+                       });
+    });
+  }
   theGridAccelerator->wait();
-  ret = mysum[0];
-  free(mysum,*theGridAccelerator);
+  //  acceleratorCopyFromDevice(mysum,&ret,sizeof(sobj));
+  //  ret = mysum[0];
+
   sobjD dret; convertType(dret,ret);
+  //  free(mysum,*theGridAccelerator);
+  //  free(mysum);
   return dret;
 }
 
@@ -73,19 +82,24 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
 template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
   Word xorResult; xorResult = 0;
-  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
+  //  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
   Word identity;  identity=0;
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
-     cgh.parallel_for(cl::sycl::range<1>{L},
-                      Reduction,
-                      [=] (cl::sycl::id<1> index, auto &sum) {
-                        sum ^=vec[index];
-                      });
-   });
+  Word ret = 0; // the reduction folds the buffer's initial value into the result, so it must start at zero
+  {
+    sycl::buffer<Word, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      //  auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
+      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(cl::sycl::range<1>{L},
+                       Reduction,
+                       [=] (cl::sycl::id<1> index, auto &sum) {
+                         sum ^=vec[index];
+                       });
+    });
+  }
   theGridAccelerator->wait();
-  Word ret = d_sum[0];
-  free(d_sum,*theGridAccelerator);
+  //  ret = d_sum[0];
+  //  free(d_sum,*theGridAccelerator);
   return ret;
 }
 