/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/Threads.h

    Copyright (C) 2015

Author: Peter Boyle
Author: paboyle

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#pragma once

#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif

#define strong_inline __attribute__((always_inline)) inline

#ifdef _OPENMP
#define GRID_OMP
#include <omp.h>
#endif

#ifdef __NVCC__
#define GRID_NVCC
#endif

//////////////////////////////////////////////////////////////////////////////////
// New primitives; explicit host thread calls, and accelerator data parallel calls
//////////////////////////////////////////////////////////////////////////////////

#ifdef GRID_OMP
#define thread_loop( range , ... )           _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
#define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)")          for range { __VA_ARGS__ ; };
#define thread_loop_collapse2( range , ... ) _Pragma("omp parallel for collapse(2)")      for range { __VA_ARGS__ };
#define thread_loop_collapse3( range , ... ) _Pragma("omp parallel for collapse(3)")      for range { __VA_ARGS__ };
#define thread_loop_collapse4( range , ... ) _Pragma("omp parallel for collapse(4)")      for range { __VA_ARGS__ };
#define thread_region                        _Pragma("omp parallel")
#define thread_critical                      _Pragma("omp critical")
#define thread_num(a)  omp_get_thread_num()
#define thread_max(a)  omp_get_max_threads()
#else
#define thread_loop( range , ... )           for range { __VA_ARGS__ ; };
#define thread_loop_in_region( range , ... ) for range { __VA_ARGS__ ; };
#define thread_loop_collapse2( range , ... ) for range { __VA_ARGS__ ; };
#define thread_loop_collapse3( range , ... ) for range { __VA_ARGS__ ; };
#define thread_loop_collapse4( range , ... ) for range { __VA_ARGS__ ; };
#define thread_region
#define thread_critical
#define thread_num(a)  (0)
#define thread_max(a)  (1)
#endif
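//////////////////////////////////////////////////////////////////////////////////
// Illustrative usage (a sketch, not part of this header; N, out, a, x, y are
// hypothetical names). The "range" argument is the full, parenthesised for(...)
// header, and the variadic arguments form the loop body:
//
//   thread_loop( (int ss=0;ss<N;ss++), {
//     out[ss] = a*x[ss] + y[ss];
//   });
//
// Inside a thread_region, thread_loop_in_region shares iterations across the
// existing OpenMP team rather than opening a new parallel region.
//////////////////////////////////////////////////////////////////////////////////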
//////////////////////////////////////////////////////////////////////////////////
// Accelerator primitives; fall back to threading
//////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC

extern uint32_t gpu_threads;

template<typename lambda> __global__
void LambdaApply(uint64_t base, uint64_t Num, lambda Lambda)
{
  uint64_t ss = blockIdx.x*blockDim.x + threadIdx.x;
  if ( ss < Num ) {
    Lambda(ss+base);
  }
}

#define accelerator        __host__ __device__
#define accelerator_inline __host__ __device__ inline

#define accelerator_loop( iterator, range, ... )                \
  typedef decltype(range.begin()) Iterator;                     \
  auto lambda = [=] accelerator (Iterator iterator) mutable {   \
    __VA_ARGS__;                                                \
  };                                                            \
  Iterator num  = range.end() - range.begin();                  \
  Iterator base = range.begin();                                \
  Iterator num_block = (num+gpu_threads-1)/gpu_threads;         \
  LambdaApply<<<num_block,gpu_threads>>>(base,num,lambda);      \
  cudaDeviceSynchronize();                                      \
  cudaError err = cudaGetLastError();                           \
  if ( cudaSuccess != err ) {                                   \
    printf("Cuda error %s\n",cudaGetErrorString( err ));        \
    exit(0);                                                    \
  }

#define accelerator_loopN( iterator, num, ... )                 \
  typedef decltype(num) Iterator;                               \
  auto lambda = [=] accelerator (Iterator iterator) mutable {   \
    __VA_ARGS__;                                                \
  };                                                            \
  Iterator base = 0;                                            \
  Iterator num_block = (num+gpu_threads-1)/gpu_threads;         \
  LambdaApply<<<num_block,gpu_threads>>>(base,num,lambda);      \
  cudaDeviceSynchronize();                                      \
  cudaError err = cudaGetLastError();                           \
  if ( cudaSuccess != err ) {                                   \
    printf("Cuda error %s\n",cudaGetErrorString( err ));        \
    exit(0);                                                    \
  }

#define cpu_loop( iterator, range, ... ) thread_loop( (auto iterator = range.begin();iterator<range.end();iterator++), { __VA_ARGS__ });

// 2D kernel: outer (site) index and inner (SIMD lane) index from one flat thread id
template<typename lambda> __global__
void LambdaApply2D(uint64_t Osites, uint64_t Isites, lambda Lambda)
{
  uint64_t site  = threadIdx.x + blockIdx.x*blockDim.x;
  uint64_t osite = site / Isites;
  if ( (osite < Osites) ) {
    Lambda(osite,site%Isites);
  }
}

#define coalesce_loop( iterator, range, nsimd, ... )                    \
  typedef uint64_t Iterator;                                            \
  auto lambda = [=] accelerator (Iterator iterator,Iterator lane) mutable { \
    __VA_ARGS__;                                                        \
  };                                                                    \
  Iterator num = range.end() - range.begin();                           \
  Iterator cu_threads = gpu_threads;                                    \
  Iterator cu_blocks  = (num*nsimd+cu_threads-1)/cu_threads;            \
  LambdaApply2D<<<cu_blocks,cu_threads>>>(num,(uint64_t)nsimd,lambda);  \
  cudaDeviceSynchronize();                                              \
  cudaError err = cudaGetLastError();                                   \
  if ( cudaSuccess != err ) {                                           \
    printf("Cuda error %s\n",cudaGetErrorString( err ));                \
    exit(0);                                                            \
  }

#else

#define accelerator
#define accelerator_inline strong_inline

#define accelerator_loop( iterator, range, ... ) \
  thread_loop( (auto iterator = range.begin();iterator<range.end();iterator++), { __VA_ARGS__ });

#define accelerator_loopN( iterator, num, ... ) \
  thread_loop( (decltype(num) iterator = 0;iterator<num;iterator++), { __VA_ARGS__ });

#define cpu_loop( iterator, range, ... ) \
  thread_loop( (auto iterator = range.begin();iterator<range.end();iterator++), { __VA_ARGS__ });

#endif
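//////////////////////////////////////////////////////////////////////////////////
// Illustrative usage (a sketch; out, a, x, y and range are hypothetical names,
// with range exposing integer-like begin()/end()). The same body runs as a CUDA
// kernel under NVCC and as a host thread_loop otherwise; the lambda captures by
// value, so out, x and y should be device-accessible pointers on the GPU path:
//
//   accelerator_loop( ss, range, {
//     out[ss] = a*x[ss] + y[ss];
//   });
//////////////////////////////////////////////////////////////////////////////////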