From 8f70cfeda93319cb5e1db7e45c2d23264e8264e3 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 18 Oct 2024 13:56:53 -0400
Subject: [PATCH] Clean up

---
 Grid/threads/Accelerator.h | 59 +++++++-------------------------------
 1 file changed, 11 insertions(+), 48 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index e37b5fb7..dc68fd2d 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -132,27 +132,17 @@ inline void cuda_mem(void)
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )    \
   {                                                                     \
-    int nt=acceleratorThreads();                                        \
-    typedef uint64_t Iterator;                                          \
-    auto lambda = [=] accelerator                                       \
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {           \
-      __VA_ARGS__;                                                      \
-    };                                                                  \
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);                      \
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                             \
-    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
-  }
-#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
-  {                                                                     \
-    int nt=acceleratorThreads();                                        \
-    typedef uint64_t Iterator;                                          \
-    auto lambda = [=] accelerator                                       \
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {           \
-      __VA_ARGS__;                                                      \
-    };                                                                  \
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);                      \
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                             \
-    ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+    if ( num1*num2 ) {                                                  \
+      int nt=acceleratorThreads();                                      \
+      typedef uint64_t Iterator;                                        \
+      auto lambda = [=] accelerator                                     \
+        (Iterator iter1,Iterator iter2,Iterator lane) mutable {         \
+        __VA_ARGS__;                                                    \
+      };                                                                \
+      dim3 cu_threads(nsimd,acceleratorThreads(),1);                    \
+      dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                           \
+      LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+    }                                                                   \
   }
 
 #define accelerator_for6dNB(iter1, num1,                                \
@@ -175,19 +165,6 @@ inline void cuda_mem(void)
   }
 
 
-#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )    \
-  {                                                                     \
-    int nt=acceleratorThreads();                                        \
-    typedef uint64_t Iterator;                                          \
-    auto lambda = [=] accelerator                                       \
-      (Iterator iter1,Iterator iter2,Iterator lane) mutable {           \
-      __VA_ARGS__;                                                      \
-    };                                                                  \
-    dim3 cu_threads(nsimd,acceleratorThreads(),1);                      \
-    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                             \
-    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
-  }
-
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
@@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
     Lambda(x,y,z);
   }
 }
-template<typename lambda>  __global__
-void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
-{
-  // Weird permute is to make lane coalesce for large blocks
-  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
-  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
-  uint64_t z = threadIdx.x;
-  if ( (x < num1) && (y<num2) && (z<num3) ) {
-    Lambda(x,y,z);
-  }
-}
 
 template<typename lambda>  __global__
 void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@@ -523,9 +489,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
 #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
 // FIXME -- the non-blocking nature got broken March 30 2023 by PAB
 #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
-#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
-  prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
-  accelerator_barrier(dummy);
 
 #define accelerator_for( iter, num, nsimd, ... )                        \
   accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );                \