mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-30 19:44:32 +00:00 
			
		
		
		
	GPU work allocation improved
This commit is contained in:
		| @@ -77,26 +77,33 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| ////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////// | ||||||
| #ifdef GRID_NVCC | #ifdef GRID_NVCC | ||||||
|  |  | ||||||
|  | constexpr uint32_t gpu_threads = 8; | ||||||
|  |  | ||||||
| template<typename lambda>  __global__ | template<typename lambda>  __global__ | ||||||
| void LambdaApply(uint64_t Num, lambda Lambda) | void LambdaApply(uint64_t Num, lambda Lambda) | ||||||
| { | { | ||||||
|   uint64_t ss = blockIdx.x; |   uint64_t ss = blockIdx.x*blockDim.x + threadIdx.x; | ||||||
|   if ( ss < Num ) { |   if ( ss < Num ) { | ||||||
|     Lambda(ss); |     Lambda(ss); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| #define accelerator __host__ __device__ | #define accelerator        __host__ __device__ | ||||||
| #define accelerator_inline __host__ __device__ inline | #define accelerator_inline __host__ __device__ inline | ||||||
|  |  | ||||||
| #define accelerator_loop( iterator, range, ... )			\ | #define accelerator_loop( iterator, range, ... )			\ | ||||||
|   typedef decltype(range.begin()) Iterator;				\ |   typedef decltype(range.begin()) Iterator;				\ | ||||||
|   auto lambda = [=] accelerator (Iterator iterator) mutable {		\ |   auto lambda = [=] accelerator (Iterator iterator) mutable {		\ | ||||||
|     __VA_ARGS__;							\ |     __VA_ARGS__;							\ | ||||||
|   };									\ |   };									\ | ||||||
|   Iterator num = range.end();						\ |   Iterator num = range.end();						\ | ||||||
|   LambdaApply<<<num,1>>>(num,lambda);					\ |   Iterator num_block = (num + gpu_threads - 1)/gpu_threads;		\ | ||||||
|   cudaDeviceSynchronize(); |   LambdaApply<<<num_block,gpu_threads>>>(num,lambda);			\ | ||||||
|  |   cudaError err = cudaGetLastError();					\ | ||||||
|  |   cudaDeviceSynchronize();						\ | ||||||
|  |   if ( cudaSuccess != err ) {						\ | ||||||
|  |     printf("Cuda error %s\n",cudaGetErrorString( err ));		\ | ||||||
|  |     exit(0);								\ | ||||||
|  |   }									\ | ||||||
|  |  | ||||||
| #define cpu_loop( iterator, range, ... )   thread_loop( (auto iterator = range.begin();iterator<range.end();iterator++), { __VA_ARGS__ }); | #define cpu_loop( iterator, range, ... )   thread_loop( (auto iterator = range.begin();iterator<range.end();iterator++), { __VA_ARGS__ }); | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user