From c564611ba78c4a61dda5aa367080f45f9b11504f Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 29 Sep 2023 17:11:12 -0400
Subject: [PATCH] Annoying hack that is useful to preserve for profiling

---
 Grid/threads/Accelerator.h | 41 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 2aeb9fa7..c5544a88 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -137,6 +137,18 @@ inline void cuda_mem(void)
     dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                             \
     LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
   }
+#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
+  {                                                                     \
+    int nt=acceleratorThreads();                                        \
+    typedef uint64_t Iterator;                                          \
+    auto lambda = [=] accelerator                                       \
+      (Iterator iter1,Iterator iter2,Iterator lane) mutable {           \
+      __VA_ARGS__;                                                      \
+    };                                                                  \
+    dim3 cu_threads(nsimd,acceleratorThreads(),1);                      \
+    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                             \
+    ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+  }
 
 #define accelerator_for6dNB(iter1, num1,                                \
                             iter2, num2,                                \
@@ -157,6 +169,20 @@ inline void cuda_mem(void)
     Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
   }
+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )     \
+  {                                                                     \
+    int nt=acceleratorThreads();                                        \
+    typedef uint64_t Iterator;                                          \
+    auto lambda = [=] accelerator                                       \
+      (Iterator iter1,Iterator iter2,Iterator lane) mutable {           \
+      __VA_ARGS__;                                                      \
+    };                                                                  \
+    dim3 cu_threads(nsimd,acceleratorThreads(),1);                      \
+    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);                             \
+    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+  }
+
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
@@ -168,6 +194,17 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
     Lambda(x,y,z);
   }
 }
+template<typename lambda>  __global__
+void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
+{
+  // Weird permute is to make lane coalesce for large blocks
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
+  uint64_t z = threadIdx.x;
+  if ( (x < num1) && (y<num2) && (z<num3) ) {
+    Lambda(x,y,z);
+  }
+}
 
 template<typename lambda> __global__
 void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@@ -208,6 +245,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   if( err != cudaSuccess ) {
     ptr = (void *) NULL;
     printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    assert(0);
   }
   return ptr;
 };
@@ -460,6 +498,9 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
 // FIXME -- the non-blocking nature got broken March 30 2023 by PAB
 #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
+#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
+  prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
+  accelerator_barrier(dummy);
 
 #define accelerator_for( iter, num, nsimd, ... ) \
   accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
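
Note (not part of the patch): the hack is simply to duplicate the launch path under a
distinctly named kernel. A loop wrapped in prof_accelerator_for is launched through
ProfileLambdaApply rather than the generic LambdaApply, so it appears under its own
kernel name in an nvprof/Nsight timeline instead of being lumped in with every other
accelerator_for site; the wrapper also issues accelerator_barrier(dummy), so the launch
has completed before the host continues. The standalone CUDA sketch below illustrates
only the naming trick, not Grid's lane permute; genericKernel, profiledKernel and
saxpy_body are hypothetical names, and it requires nvcc's --extended-lambda flag.

    // prof_sketch.cu -- compile with: nvcc --extended-lambda prof_sketch.cu
    #include <cstdio>
    #include <cstdint>
    #include <cuda_runtime.h>

    // Generic launch path: every loop body funnels through one kernel name,
    // so a profiler attributes all call sites to "genericKernel".
    template <typename lambda> __global__
    void genericKernel(uint64_t num, lambda Lambda)
    {
      uint64_t i = threadIdx.x + (uint64_t)blockDim.x * blockIdx.x;
      if (i < num) Lambda(i);
    }

    // Identical body under a different name: launching a selected call site
    // through this wrapper makes it show up separately as "profiledKernel".
    template <typename lambda> __global__
    void profiledKernel(uint64_t num, lambda Lambda)
    {
      uint64_t i = threadIdx.x + (uint64_t)blockDim.x * blockIdx.x;
      if (i < num) Lambda(i);
    }

    int main(void)
    {
      const uint64_t N = 1 << 20;
      float *x, *y;
      cudaMallocManaged(&x, N * sizeof(float));
      cudaMallocManaged(&y, N * sizeof(float));
      for (uint64_t i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; }

      // Same device lambda passed to both kernels.
      auto saxpy_body = [=] __device__ (uint64_t i) { y[i] = 2.0f * x[i] + y[i]; };

      int threads = 256;
      int blocks  = (int)((N + threads - 1) / threads);

      genericKernel <<<blocks, threads>>>(N, saxpy_body); // ordinary launch
      profiledKernel<<<blocks, threads>>>(N, saxpy_body); // distinct name for the profiler
      cudaDeviceSynchronize();

      printf("y[0] = %f (expect 6.0 after two launches)\n", y[0]);
      cudaFree(x);
      cudaFree(y);
      return 0;
    }

Under a profiler such as nsys or ncu, the second launch is reported against
profiledKernel, which mirrors why the otherwise redundant ProfileLambdaApply
wrapper is worth preserving.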