1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Testing out approaches to kernel writing, introducing SIMT_loop temporarily

This commit is contained in:
Peter Boyle 2019-06-08 13:47:04 +01:00
parent 29a244e423
commit 8adc5da7dd

View File

@ -157,7 +157,6 @@ void LambdaApply2D(uint64_t Osites, uint64_t Isites, lambda Lambda)
__VA_ARGS__; \
}; \
Iterator num = range.end() - range.begin(); \
Iterator base = range.begin(); \
Iterator cu_threads= gpu_threads; \
Iterator cu_blocks = num*nsimd/cu_threads; \
LambdaApply2D<<<cu_blocks,cu_threads>>>(num,(uint64_t)nsimd,lambda); \
@ -168,6 +167,31 @@ void LambdaApply2D(uint64_t Osites, uint64_t Isites, lambda Lambda)
exit(0); \
}
// SIMT launch kernel: a 2-D thread block where x indexes the outer site and
// y indexes the SIMD lane.  Expected launch shape (set up by SIMT_loop):
//   blockDim = (gpu_threads, nsimd), gridDim.x = ceil(Osites/gpu_threads).
// The bounds guard handles the rounded-up grid tail in x; the lane check is
// defensive (blockDim.y is normally exactly Isites).
template<typename lambda> __global__
void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
{
  uint64_t lane = threadIdx.y;                            // SIMD lane within the site
  uint64_t site = blockIdx.x*blockDim.x + threadIdx.x;    // flat outer-site index
  if ( (site < Osites) && (lane < Isites) ) {
    Lambda(lane,site);
  }
}
// SIMT_loop: wrap the loop body (__VA_ARGS__) in a device lambda taking
// (lane, iterator), then launch LambdaApplySIMT over a 2-D block:
// threads.x covers outer sites (gpu_threads per block, grid rounded up with
// ceil-div), threads.y covers the nsimd SIMD lanes.  `gpu_threads` is
// assumed to be defined elsewhere in this header — TODO confirm.
// After the launch it synchronizes and aborts on any CUDA error.
// NOTE(review): exit(0) reports success status on failure, and no comments
// may be placed on the continuation lines below — a `//` comment would
// splice the next line into itself and break the macro.
#define SIMT_loop( iterator, num, nsimd, ... ) \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(gpu_threads,nsimd); \
dim3 cu_blocks ((num+gpu_threads-1)/gpu_threads); \
LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
cudaDeviceSynchronize(); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("Cuda error %s\n",cudaGetErrorString( err )); \
exit(0); \
}
#else
#define accelerator
@ -186,4 +210,7 @@ void LambdaApply2D(uint64_t Osites, uint64_t Isites, lambda Lambda)
// CPU fallback (no accelerator): both loop flavours collapse onto the plain
// CPU loop forms; the SIMD-lane count (nsimd) is ignored on this path.
#define coalesce_loop( iterator, range, nsimd, ... ) cpu_loop(iterator,range,{__VA_ARGS__})
#define SIMT_loop( iterator, num, nsimd, ... ) accelerator_loopN( iterator, num, {__VA_ARGS__})
#endif