
Use the X-direction for the large loop index, as more bits are meaningful there on CUDA.

2^31-1 should always be enough for the SIMD- and thread-reduced local volume,

e.g. 32*2^31 = 2^36 = (2^9)^4, i.e. a 512^4 local volume, is big enough,

where 32 = gpu_threads * Nsimd = 8*4.
Peter Boyle, 2020-05-12 10:35:49 -04:00
parent 07c0c02f8c
commit d24d8e8398
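A quick sanity check of the arithmetic in the message above, as a standalone C++ sketch (not part of the commit; gpu_threads = 8 and Nsimd = 4 are the values quoted in the message):

// Standalone check of the commit-message arithmetic (illustrative only).
#include <cstdint>

constexpr uint64_t gpu_threads = 8;   // threads per block in the blocked direction
constexpr uint64_t Nsimd       = 4;   // SIMD lanes per site

static_assert(gpu_threads * Nsimd == 32, "32 = gpu_threads * Nsimd");
// 32 * 2^31 = 2^36 = (2^9)^4, i.e. a 512^4 local volume is addressable
static_assert(32ull * (1ull << 31) == (1ull << 36), "32*2^31 = 2^36");
static_assert((1ull << 36) == 512ull*512ull*512ull*512ull, "2^36 = 512^4");

int main() { return 0; }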


@@ -85,27 +85,27 @@ void acceleratorInit(void);
 #define accelerator __host__ __device__
 #define accelerator_inline __host__ __device__ inline
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.x; } // CUDA specific
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.z; } // CUDA specific
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   { \
     typedef uint64_t Iterator; \
     auto lambda = [=] accelerator \
-    (Iterator lane,Iterator iter1,Iterator iter2) mutable { \
+    (Iterator iter1,Iterator iter2,Iterator lane) mutable { \
     __VA_ARGS__; \
     }; \
     int nt=acceleratorThreads(); \
-    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
-    dim3 cu_blocks (1,(num1+nt-1)/nt,num2); \
-    LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num1,num2,lambda); \
+    dim3 cu_threads(acceleratorThreads(),1,nsimd); \
+    dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
+    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
   }
 template<typename lambda> __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
-  uint64_t x = threadIdx.x;//+ blockDim.x*blockIdx.x;
+  uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
   uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
-  uint64_t z = threadIdx.z + blockDim.z*blockIdx.z;
+  uint64_t z = threadIdx.z;
   if ( (x < num1) && (y<num2) && (z<num3) ) {
     Lambda(x,y,z);
   }
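Seen in isolation, the new CUDA geometry amounts to the following minimal, self-contained sketch (not Grid's actual header; the problem sizes and nt = 8 are illustrative assumptions taken from the commit message). The large iter1 range is blocked over gridDim.x, which allows up to 2^31-1 blocks, while the SIMD lane never leaves threadIdx.z:

// Minimal CUDA sketch of the new thread/block mapping (illustrative only).
#include <cstdio>
#include <cstdint>

__global__ void LambdaApplySketch(uint64_t num1, uint64_t num2, uint64_t nsimd)
{
  uint64_t x = threadIdx.x + (uint64_t)blockDim.x*blockIdx.x; // iter1: blocked over gridDim.x
  uint64_t y = threadIdx.y + (uint64_t)blockDim.y*blockIdx.y; // iter2: one block per value
  uint64_t z = threadIdx.z;                                   // SIMD lane: intra-block only
  if ( (x < num1) && (y < num2) && (z < nsimd) ) {
    // the loop body would be invoked here as Lambda(x,y,z)
  }
}

int main(void)
{
  uint64_t num1 = 1<<20, num2 = 16, nsimd = 4;  // assumed problem sizes
  int      nt   = 8;                            // gpu_threads, per the commit message
  dim3 cu_threads(nt,1,nsimd);                  // (x,y,z) = (threads, 1, SIMD lanes)
  dim3 cu_blocks ((num1+nt-1)/nt,num2,1);       // big range rides gridDim.x
  LambdaApplySketch<<<cu_blocks,cu_threads>>>(num1,num2,nsimd);
  cudaDeviceSynchronize();
  printf("blocks=(%u,%u,%u) threads=(%u,%u,%u)\n",
         cu_blocks.x,cu_blocks.y,cu_blocks.z,
         cu_threads.x,cu_threads.y,cu_threads.z);
  return 0;
}

With nt*nsimd = 8*4 = 32 threads per block, the 2^31-1 block limit in x reproduces the 32*2^31 = 2^36 addressable sites quoted in the commit message.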
@@ -167,26 +167,24 @@ extern cl::sycl::queue *theGridAccelerator;
 #define accelerator
 #define accelerator_inline strong_inline
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[0]; } // SYCL specific
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
     int nt=acceleratorThreads(); \
     unsigned long unum1 = num1; \
     unsigned long unum2 = num2; \
-    cl::sycl::range<3> local {nsimd,nt,1}; \
-    cl::sycl::range<3> global{nsimd,unum1,unum2}; \
+    cl::sycl::range<3> local {nt,1,nsimd}; \
+    cl::sycl::range<3> global{unum1,unum2,nsimd}; \
     cgh.parallel_for<class dslash>( \
       cl::sycl::nd_range<3>(global,local), \
       [=] (cl::sycl::nd_item<3> item) mutable { \
-        auto lane = item.get_global_id(0); \
-        auto iter1 = item.get_global_id(1); \
-        auto iter2 = item.get_global_id(2); \
+        auto iter1 = item.get_global_id(0); \
+        auto iter2 = item.get_global_id(1); \
+        auto lane = item.get_global_id(2); \
         { __VA_ARGS__ }; \
       }); \
     });
-    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
-    dim3 cu_blocks (1,(num1+nt-1)/n,num2); \
 #define accelerator_barrier(dummy) theGridAccelerator->wait();
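The SYCL branch applies the same index permutation to the nd_range. A hedged standalone sketch (a default queue and illustrative sizes stand in for Grid's theGridAccelerator and real volumes):

// Standalone SYCL sketch of the permuted nd_range (illustrative only).
#include <CL/sycl.hpp>

int main(void)
{
  cl::sycl::queue q;
  unsigned long num1 = 1024, num2 = 16, nsimd = 4, nt = 8; // assumed sizes
  q.submit([&](cl::sycl::handler &cgh) {
    cl::sycl::range<3> local {nt, 1, nsimd};      // SIMD lanes now in dimension 2
    cl::sycl::range<3> global{num1, num2, nsimd}; // big range now in dimension 0
    cgh.parallel_for<class sketch>(
      cl::sycl::nd_range<3>(global, local),
      [=](cl::sycl::nd_item<3> item) {
        auto iter1 = item.get_global_id(0);
        auto iter2 = item.get_global_id(1);
        auto lane  = item.get_global_id(2);
        (void)iter1; (void)iter2; (void)lane; // loop body would go here
      });
  });
  q.wait();
  return 0;
}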
@@ -213,30 +211,29 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator_inline __host__ __device__ inline
 /*These routines define mapping from thread grid to loop & vector lane indexing */
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_x; } // HIP specific
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_z; } // HIP specific
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   { \
     typedef uint64_t Iterator; \
     auto lambda = [=] accelerator \
-    (Iterator lane,Iterator iter1,Iterator iter2 ) mutable { \
+    (Iterator iter1,Iterator iter2,Iterator lane ) mutable { \
     { __VA_ARGS__;} \
     }; \
     int nt=acceleratorThreads(); \
-    dim3 hip_threads(nsimd,nt,1); \
-    dim3 hip_blocks (1,(num1+nt-1)/nt,num2); \
+    dim3 hip_threads(nt,1,nsimd); \
+    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
     hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
                        0,0, \
-                       nsimd,num1,num2,lambda); \
+                       num1,num2,nsimd,lambda); \
   }
 template<typename lambda> __global__
 void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
-  uint64_t x = hipThreadIdx_x;//+ hipBlockDim_x*hipBlockIdx_x;
+  uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
   uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
-  uint64_t z = hipThreadIdx_z + hipBlockDim_z*hipBlockIdx_z;
+  uint64_t z = hipThreadIdx_z;//+ hipBlockDim_z*hipBlockIdx_z;
   if ( (x < numx) && (y<numy) && (z<numz) ) {
     Lambda(x,y,z);
   }
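Stepping back, the motivation in numbers: a small host-only C++ sketch comparing the loop range each mapping can express. The 2^31-1 and 65535 grid-dimension limits are the standard CUDA values; nt = 8 comes from the commit message, and the old/new caps are inferred from the block geometries in the diff above:

// Host-only comparison of what each mapping can address (illustrative).
#include <cstdio>
#include <cstdint>

int main(void)
{
  const uint64_t max_x = (1ull<<31) - 1; // gridDim.x block limit
  const uint64_t max_y = 65535;          // gridDim.y block limit
  const uint64_t nt    = 8;              // gpu_threads, per the commit message

  // Old mapping: num1 was blocked over gridDim.y -> at most 65535*8 iterations.
  printf("old cap on num1: %llu\n", (unsigned long long)(max_y*nt));
  // New mapping: num1 is blocked over gridDim.x -> at most (2^31-1)*8 iterations.
  printf("new cap on num1: %llu\n", (unsigned long long)(max_x*nt));
  return 0;
}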