From d24d8e8398ebb9a8e65fcadbccda09ab7a89e3a7 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 12 May 2020 10:35:49 -0400
Subject: [PATCH] Use X-direction as more bits are meaningful on CUDA.

2^31-1 should always be enough for the SIMD- and thread-reduced local
volume: e.g. 32*2^31 = 2^36 = (2^9)^4, so 512^4 is big enough, where
32 is gpu_threads * Nsimd = 8*4.
---
 Grid/threads/Accelerator.h | 43 ++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 23 deletions(-)
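
As a compile-time check of the arithmetic above, a minimal sketch (not part
of the patch, and assuming the quoted gpu_threads * Nsimd = 8*4 configuration):

    // Illustrative only; not part of Accelerator.h. Verifies the commit-message
    // arithmetic: with gpu_threads*Nsimd = 8*4 = 32 work items per x entry,
    // a ~2^31 x-range covers 32*2^31 = 2^36 = 512^4 sites.
    #include <cstdint>

    constexpr uint64_t gpu_threads = 8;           // assumed threads per block
    constexpr uint64_t Nsimd       = 4;           // assumed SIMD width
    constexpr uint64_t x_range     = 1ULL << 31;  // iterations carried by x

    static_assert(gpu_threads * Nsimd == 32, "32 work items per x entry");
    static_assert(gpu_threads * Nsimd * x_range == (1ULL << 36), "32*2^31 = 2^36");
    static_assert((1ULL << 36) == 512ULL*512ULL*512ULL*512ULL, "2^36 = 512^4");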

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 1569b22b..0a5103a2 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -85,27 +85,27 @@ void acceleratorInit(void);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline
 
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.x; } // CUDA specific
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.z; } // CUDA specific
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   {                                                                 \
     typedef uint64_t Iterator;                                      \
     auto lambda = [=] accelerator                                   \
-      (Iterator lane,Iterator iter1,Iterator iter2) mutable {      \
+      (Iterator iter1,Iterator iter2,Iterator lane) mutable {      \
      __VA_ARGS__;                                                   \
    };                                                               \
    int nt=acceleratorThreads();                                     \
-   dim3 cu_threads(nsimd,acceleratorThreads(),1);                   \
-   dim3 cu_blocks (1,(num1+nt-1)/nt,num2);                          \
-   LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num1,num2,lambda);   \
+   dim3 cu_threads(acceleratorThreads(),1,nsimd);                   \
+   dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                          \
+   LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);   \
  }
 
 template<typename lambda> __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
-  uint64_t x = threadIdx.x;//+ blockDim.x*blockIdx.x;
+  uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
   uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
-  uint64_t z = threadIdx.z + blockDim.z*blockIdx.z;
+  uint64_t z = threadIdx.z;
   if ( (x < num1) && (y < num2) && (z < num3) ) {
     Lambda(x,y,z);
   }
 }
@@ ... @@
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[0]; } // SYCL specific
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {          \
       int nt=acceleratorThreads();                                  \
       unsigned long unum1 = num1;                                   \
       unsigned long unum2 = num2;                                   \
-      cl::sycl::range<3> local {nsimd,nt,1};                        \
-      cl::sycl::range<3> global{nsimd,unum1,unum2};                 \
+      cl::sycl::range<3> local {nt,1,nsimd};                        \
+      cl::sycl::range<3> global{unum1,unum2,nsimd};                 \
       cgh.parallel_for(                                             \
       cl::sycl::nd_range<3>(global,local),                          \
       [=] (cl::sycl::nd_item<3> item) mutable {                     \
-      auto lane  = item.get_global_id(0);                           \
-      auto iter1 = item.get_global_id(1);                           \
-      auto iter2 = item.get_global_id(2);                           \
+      auto iter1 = item.get_global_id(0);                           \
+      auto iter2 = item.get_global_id(1);                           \
+      auto lane  = item.get_global_id(2);                           \
       { __VA_ARGS__ };                                              \
      });                                                            \
    });
 
-dim3 cu_threads(nsimd,acceleratorThreads(),1);                      \
-dim3 cu_blocks (1,(num1+nt-1)/n,num2);                              \
 
 #define accelerator_barrier(dummy) theGridAccelerator->wait();
@@ -213,30 +211,29 @@ NAMESPACE_BEGIN(Grid);
 #define accelerator_inline __host__ __device__ inline
 
 /*These routines define mapping from thread grid to loop & vector lane indexing */
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_x; } // HIP specific
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_z; } // HIP specific
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   {                                                                 \
     typedef uint64_t Iterator;                                      \
     auto lambda = [=] accelerator                                   \
-      (Iterator lane,Iterator iter1,Iterator iter2 ) mutable {     \
+      (Iterator iter1,Iterator iter2,Iterator lane ) mutable {     \
      { __VA_ARGS__;}                                                \
    };                                                               \
    int nt=acceleratorThreads();                                     \
-   dim3 hip_threads(nsimd,nt,1);                                    \
-   dim3 hip_blocks (1,(num1+nt-1)/nt,num2);                         \
+   dim3 hip_threads(nt,1,nsimd);                                    \
+   dim3 hip_blocks ((num1+nt-1)/nt,num2,1);                         \
    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,           \
                       0,0,                                          \
-                      nsimd,num1,num2,lambda);                      \
+                      num1,num2,nsimd,lambda);                      \
  }
 
-
 template<typename lambda> __global__
 void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 {
-  uint64_t x = hipThreadIdx_x;//+ hipBlockDim_x*hipBlockIdx_x;
+  uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
   uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
-  uint64_t z = hipThreadIdx_z + hipBlockDim_z*hipBlockIdx_z;
+  uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
   if ( (x < numx) && (y < numy) && (z < numz) ) {
     Lambda(x,y,z);
   }
 }
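
The reshuffle matters because CUDA allows up to 2^31-1 blocks in the grid
x-dimension but only 65535 in y and z, while blockDim.z is capped at 64,
which is ample for Nsimd. A minimal standalone sketch of the post-patch
mapping follows; LambdaApplyDemo and the sample sizes are illustrative
stand-ins, not Grid code:

    // Standalone illustration of the post-patch mapping; not Grid code.
    // x carries the large, block-decomposed loop index, y the second loop,
    // and z holds the SIMD lane, so CUDA's 2^31-1 limit in the grid
    // x-direction bounds the big index rather than the small lane count.
    #include <cstdint>

    __global__ void LambdaApplyDemo(uint64_t num1, uint64_t num2, uint64_t nsimd)
    {
      uint64_t x = threadIdx.x + blockDim.x*blockIdx.x; // loop index 1 (large)
      uint64_t y = threadIdx.y + blockDim.y*blockIdx.y; // loop index 2
      uint64_t z = threadIdx.z;                         // SIMD lane, block-local
      if ( (x < num1) && (y < num2) && (z < nsimd) ) {
        // the macro body would run here with (iter1,iter2,lane) = (x,y,z)
      }
    }

    int main(void)
    {
      uint64_t num1 = 1024, num2 = 16, nsimd = 4;   // hypothetical loop sizes
      int nt = 8;                                   // acceleratorThreads() stand-in
      dim3 cu_threads(nt, 1, nsimd);                // lane in z: limit 64 is ample
      dim3 cu_blocks((num1 + nt - 1)/nt, num2, 1);  // big index in x: up to 2^31-1 blocks
      LambdaApplyDemo<<<cu_blocks, cu_threads>>>(num1, num2, nsimd);
      cudaDeviceSynchronize();
      return 0;
    }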