diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 159a331b..7c3518cd 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -4,11 +4,6 @@ #include #define gpucub cub -#define gpuMalloc cudaMalloc -#define gpuFree cudaFree -#define gpuMemcpyAsync cudaMemcpyAsync -#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost -#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuError_t cudaError_t #define gpuSuccess cudaSuccess @@ -16,11 +11,6 @@ #include #define gpucub hipcub -#define gpuMalloc hipMalloc -#define gpuFree hipFree -#define gpuMemcpyAsync hipMemcpyAsync -#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost -#define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuError_t hipError_t #define gpuSuccess hipSuccess @@ -51,38 +41,22 @@ template inline void sliceSumReduction_cub_small(const vobj *Data, V } //Allocate memory for output and offset arrays on device - gpuError_t gpuErr = gpuMalloc(&d_out,rd*sizeof(vobj)); - if (gpuErr != gpuSuccess) { - std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_out)! Error: " << gpuErr <(acceleratorAllocDevice(rd*sizeof(vobj))); - gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); + d_offsets = static_cast(acceleratorAllocDevice((rd+1)*sizeof(int))); + + //copy offsets to device + acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); + + + gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); if (gpuErr!=gpuSuccess) { std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr < inline void sliceSumReduction_cub_small(const vobj *Data, V exit(EXIT_FAILURE); } - gpuErr = gpuMemcpyAsync(&lvSum[0],d_out,rd*sizeof(vobj),gpuMemcpyDeviceToHost,computeStream); - if (gpuErr!=gpuSuccess) { - std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMemcpy (d_out)! Error: " << gpuErr <