From d836ce3b78a4e2a1b3fb83ca27c636c44121a5d3 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Sat, 15 Jun 2019 08:14:21 +0100
Subject: [PATCH] Clean up of acceleration and threading primitives

---
 Grid/threads/Pragmas.h | 177 +++++++++++------------------------------
 1 file changed, 46 insertions(+), 131 deletions(-)

diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h
index 737691e9..b36e8159 100644
--- a/Grid/threads/Pragmas.h
+++ b/Grid/threads/Pragmas.h
@@ -34,138 +34,54 @@ Author: paboyle
 #endif

 #define strong_inline __attribute__((always_inline)) inline
+#define UNROLL _Pragma("unroll")
+
+//////////////////////////////////////////////////////////////////////////////////
+// New primitives; explicit host thread calls, and accelerator data parallel calls
+//////////////////////////////////////////////////////////////////////////////////

 #ifdef _OPENMP
 #define GRID_OMP
 #include <omp.h>
 #endif

-#ifdef __NVCC__
-#define GRID_NVCC
-#endif
-
-#define UNROLL _Pragma("unroll")
-
-//////////////////////////////////////////////////////////////////////////////////
-// New primitives; explicit host thread calls, and accelerator data parallel calls
-//////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_OMP
-
 #define DO_PRAGMA_(x) _Pragma (#x)
 #define DO_PRAGMA(x) DO_PRAGMA_(x)
-
-#define thread_loop( range , ... )                    DO_PRAGMA(omp parallel for schedule(static))for range { __VA_ARGS__ };
-#define thread_loop_collapse2( range , ... )          DO_PRAGMA(omp parallel for collapse(2)) for range { __VA_ARGS__ };
-#define thread_loop_collapse( N , range , ... )       DO_PRAGMA(omp parallel for collapse ( N ) ) for range { __VA_ARGS__ };
-#define thread_loop_in_region( range , ... )          DO_PRAGMA(omp for schedule(static)) for range { __VA_ARGS__ };
-#define thread_loop_collapse_in_region( N , range , ... ) DO_PRAGMA(omp for collapse ( N )) for range { __VA_ARGS__ };
-#define thread_region                                 DO_PRAGMA(omp parallel)
-#define thread_critical                               DO_PRAGMA(omp critical)
 #define thread_num(a) omp_get_thread_num()
 #define thread_max(a) omp_get_max_threads()
-#else
-#define thread_loop( range , ... )                    for range { __VA_ARGS__ ; };
-#define thread_loop_collapse2( range , ... )          for range { __VA_ARGS__ ; };
-#define thread_loop_collapse( N , range , ... )       for range { __VA_ARGS__ ; };
-#define thread_region
-#define thread_loop_in_region( range , ... )          for range { __VA_ARGS__ ; };
-#define thread_loop_collapse_in_region( N, range , ... )  for range { __VA_ARGS__ ; };
-
-#define thread_critical
+#else
+#define DO_PRAGMA_(x)
+#define DO_PRAGMA(x)
 #define thread_num(a) (0)
 #define thread_max(a) (1)
 #endif

+#define naked_for(i,num,...) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ };

-template <typename lambda> __global__
-void LambdaApply(uint64_t base, uint64_t Num, lambda Lambda)
-{
-  uint64_t ss = blockIdx.x*blockDim.x + threadIdx.x;
-  if ( ss < Num ) {
-    Lambda(ss+base);
-  }
-}
-
 #define accelerator __host__ __device__
 #define accelerator_inline __host__ __device__ inline
-#define accelerator_loop( iterator, range, ... ) \
-  typedef decltype(range.begin()) Iterator; \
-  auto lambda = [=] accelerator (Iterator iterator) mutable { \
-    __VA_ARGS__; \
-  }; \
-  Iterator num = range.end() - range.begin(); \
-  Iterator base = range.begin(); \
-  Iterator num_block = (num+gpu_threads-1)/gpu_threads; \
-  LambdaApply<<<num_block,gpu_threads>>>(base,num,lambda); \
-  cudaDeviceSynchronize(); \
-  cudaError err = cudaGetLastError(); \
-  if ( cudaSuccess != err ) { \
-    printf("Cuda error %s\n",cudaGetErrorString( err )); \
-    exit(0); \
-  }
-
-#define accelerator_loopN( iterator, num, ... ) \
-  typedef decltype(num) Iterator; \
-  if ( num > 0 ) { \
-    auto lambda = [=] accelerator (Iterator iterator) mutable { \
-      __VA_ARGS__; \
-    }; \
-    Iterator base = 0; \
-    Iterator num_block = (num+gpu_threads-1)/gpu_threads; \
-    LambdaApply<<<num_block,gpu_threads>>>(base,num,lambda); \
-    cudaDeviceSynchronize(); \
-    cudaError err = cudaGetLastError(); \
-    if ( cudaSuccess != err ) { \
-      printf("Cuda error %s\n",cudaGetErrorString( err )); \
-      exit(0); \
-    } \
-  }
-
-#define accelerator_loopNB( iterator, num, ... ) \
-  typedef decltype(num) Iterator; \
-  if ( num > 0 ) { \
-    auto lambda = [=] accelerator (Iterator iterator) mutable { \
-      __VA_ARGS__; \
-    }; \
-    Iterator base = 0; \
-    Iterator num_block = (num+gpu_threads-1)/gpu_threads; \
-    LambdaApply<<<num_block,gpu_threads>>>(base,num,lambda); \
-  }
-
-#define cpu_loop( iterator, range, ... ) thread_loop( (auto iterator = range.begin();iterator<range.end();iterator++), { __VA_ARGS__ });
-
-template <typename lambda> __global__
-void LambdaApply2D(uint64_t Osites, uint64_t Isites, lambda Lambda)
-{
-  uint64_t site = threadIdx.x + blockIdx.x*blockDim.x;
-  uint64_t osite = site / Isites;
-  if ( (osite
->>(num,(uint64_t)nsimd,lambda); \
-  cudaDeviceSynchronize(); \
-  cudaError err = cudaGetLastError(); \
-  if ( cudaSuccess != err ) { \
-    printf("Cuda error %s\n",cudaGetErrorString( err )); \
-    exit(0); \
-  }

 template <typename lambda> __global__
 void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
@@ -177,40 +93,39 @@ void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
   }
 }

-#define SIMT_loop( iterator, num, nsimd, ... ) \
+/////////////////////////////////////////////////////////////////
+// Internal only really... but need to call when
+/////////////////////////////////////////////////////////////////
+#define accelerator_barrier(dummy) \
+  { \
+    cudaDeviceSynchronize(); \
+    cudaError err = cudaGetLastError(); \
+    if ( cudaSuccess != err ) { \
+      printf("Cuda error %s\n",cudaGetErrorString( err )); \
+      exit(0); \
+    } \
+  }
+
+// Copy the for_each_n style ; Non-blocking variant
+#define accelerator_forNB( iterator, num, nsimd, ... ) \
   typedef uint64_t Iterator; \
  auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
    __VA_ARGS__; \
  }; \
  dim3 cu_threads(gpu_threads,nsimd); \
  dim3 cu_blocks ((num+gpu_threads-1)/gpu_threads); \
- LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
- cudaDeviceSynchronize(); \
- cudaError err = cudaGetLastError(); \
- if ( cudaSuccess != err ) { \
-   printf("Cuda error %s\n",cudaGetErrorString( err )); \
-   exit(0); \
- }
+ LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda);
+
+// Copy the for_each_n style ; Non-blocking variant (default
+#define accelerator_for( iterator, num, nsimd, ... ) \
+  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
+  accelerator_barrier(dummy);

 #else

 #define accelerator
 #define accelerator_inline strong_inline
-#define accelerator_loop( iterator, range, ... ) \
-  thread_loop( (auto iterator = range.begin();iterator<range.end();iterator++), { __VA_ARGS__ });
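
For reference, a minimal usage sketch of the new accelerator primitives; this is not part of the patch. It assumes the GRID_NVCC path (nvcc, typically with --expt-extended-lambda), that Pragmas.h can be included on its own, that managed memory is used so host and device see the same pointers, and that the application supplies the gpu_threads global the launch macros reference (the value 128 below is arbitrary). The inner-lane variable name "lane" is taken from the accelerator_forNB lambda signature above.

// Sketch only; all names not defined by Pragmas.h are local assumptions.
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "Grid/threads/Pragmas.h"

uint32_t gpu_threads = 128;   // threads per block consumed by the launch macros

int main(void)
{
  const uint64_t sites = 1024;  // outer iterator range (block x dimension)
  const int      nsimd = 8;     // inner lane count (block y dimension)

  double *x, *y;
  cudaMallocManaged(&x, sites*nsimd*sizeof(double));
  cudaMallocManaged(&y, sites*nsimd*sizeof(double));
  for (uint64_t i = 0; i < sites*nsimd; i++) { x[i] = 1.0; y[i] = 2.0; }

  // Blocking form: accelerator_for = accelerator_forNB + accelerator_barrier.
  // "ss" is the user-chosen outer iterator; "lane" is the macro's inner-lane index.
  accelerator_for(ss, sites, nsimd, {
    y[ss*nsimd+lane] = 2.0*x[ss*nsimd+lane] + y[ss*nsimd+lane];
  });

  {
    // Non-blocking form: the kernel launch returns immediately; synchronise explicitly.
    accelerator_forNB(ss, sites, nsimd, {
      x[ss*nsimd+lane] = 0.5*y[ss*nsimd+lane];
    });
    accelerator_barrier();
  }

  printf("y[0] = %f  x[0] = %f\n", y[0], x[0]);
  cudaFree(x); cudaFree(y);
  return 0;
}

Since the blocking accelerator_for is just accelerator_forNB followed by accelerator_barrier, several kernels can be queued with the NB form and synchronised once at the end.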
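On the host side, the patch builds its loops from DO_PRAGMA and naked_for together with thread_num/thread_max. The wrapper name below (my_thread_for) is illustrative only, not taken from the patch; it shows how those pieces compose. Compile with -fopenmp to exercise the GRID_OMP branch; without OpenMP, DO_PRAGMA expands to nothing and the loop runs serially.

// Sketch only; my_thread_for is a hypothetical wrapper in the style of the header.
#include <stdio.h>
#include <stdint.h>
#include <vector>
#include "Grid/threads/Pragmas.h"

// Parallel host loop assembled from the primitives visible in the patch.
#define my_thread_for(i, num, ...) \
  DO_PRAGMA(omp parallel for schedule(static)) naked_for(i, num, { __VA_ARGS__ })

int main(void)
{
  std::vector<double> a(1000, 1.0), b(1000, 2.0);

  // Each iteration is independent, so it is safe to split across host threads.
  my_thread_for(i, a.size(), {
    b[i] += a[i];
  });

  printf("host threads = %d, b[0] = %f\n", thread_max(), b[0]);
  return 0;
}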