mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Replace cuda/hip memcpy with Grid functions
This commit is contained in:
parent
0a816b5509
commit
6cd2d8fcd5
@ -4,11 +4,6 @@
|
|||||||
|
|
||||||
#include <cub/cub.cuh>
|
#include <cub/cub.cuh>
|
||||||
#define gpucub cub
|
#define gpucub cub
|
||||||
#define gpuMalloc cudaMalloc
|
|
||||||
#define gpuFree cudaFree
|
|
||||||
#define gpuMemcpyAsync cudaMemcpyAsync
|
|
||||||
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
|
|
||||||
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
|
|
||||||
#define gpuError_t cudaError_t
|
#define gpuError_t cudaError_t
|
||||||
#define gpuSuccess cudaSuccess
|
#define gpuSuccess cudaSuccess
|
||||||
|
|
||||||
@ -16,11 +11,6 @@
|
|||||||
|
|
||||||
#include <hipcub/hipcub.hpp>
|
#include <hipcub/hipcub.hpp>
|
||||||
#define gpucub hipcub
|
#define gpucub hipcub
|
||||||
#define gpuMalloc hipMalloc
|
|
||||||
#define gpuFree hipFree
|
|
||||||
#define gpuMemcpyAsync hipMemcpyAsync
|
|
||||||
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
|
|
||||||
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
|
|
||||||
#define gpuError_t hipError_t
|
#define gpuError_t hipError_t
|
||||||
#define gpuSuccess hipSuccess
|
#define gpuSuccess hipSuccess
|
||||||
|
|
||||||
@ -51,38 +41,22 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
|||||||
}
|
}
|
||||||
|
|
||||||
//Allocate memory for output and offset arrays on device
|
//Allocate memory for output and offset arrays on device
|
||||||
gpuError_t gpuErr = gpuMalloc(&d_out,rd*sizeof(vobj));
|
d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
|
||||||
if (gpuErr != gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_out)! Error: " << gpuErr <<std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
gpuErr = gpuMalloc(&d_offsets,sizeof(int)*(rd+1));
|
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
|
||||||
if (gpuErr != gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_offsets)! Error: " << gpuErr <<std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
//copy offsets to device
|
//copy offsets to device
|
||||||
gpuErr = gpuMemcpyAsync(d_offsets,&offsets[0],sizeof(int)*(rd+1),gpuMemcpyHostToDevice,computeStream);
|
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||||
if (gpuErr != gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMemcpy (d_offsets)! Error: " << gpuErr <<std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
||||||
if (gpuErr!=gpuSuccess) {
|
if (gpuErr!=gpuSuccess) {
|
||||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
|
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
//allocate memory for temp_storage_array
|
//allocate memory for temp_storage_array
|
||||||
gpuErr = gpuMalloc(&temp_storage_array,temp_storage_bytes);
|
temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
|
||||||
if (gpuErr!=gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (temp_storage_array)! Error: " << gpuErr <<std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
//prepare buffer for reduction
|
//prepare buffer for reduction
|
||||||
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
|
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
|
||||||
@ -105,18 +79,14 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
|||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
gpuErr = gpuMemcpyAsync(&lvSum[0],d_out,rd*sizeof(vobj),gpuMemcpyDeviceToHost,computeStream);
|
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||||
if (gpuErr!=gpuSuccess) {
|
|
||||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMemcpy (d_out)! Error: " << gpuErr <<std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
//sync after copy
|
//sync after copy
|
||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
|
|
||||||
gpuFree(temp_storage_array);
|
acceleratorFreeDevice(temp_storage_array);
|
||||||
gpuFree(d_out);
|
acceleratorFreeDevice(d_out);
|
||||||
gpuFree(d_offsets);
|
acceleratorFreeDevice(d_offsets);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -225,6 +225,8 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
|||||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||||
|
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
||||||
|
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||||
{
|
{
|
||||||
@ -442,6 +444,8 @@ inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
|
|||||||
inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||||
|
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
|
||||||
|
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
|
||||||
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||||
//inline void acceleratorCopySynchronise(void) { }
|
//inline void acceleratorCopySynchronise(void) { }
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
|
||||||
|
Loading…
Reference in New Issue
Block a user