mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Annoying hack that is useful to preserve for profiling
This commit is contained in:
parent
e187bcb85c
commit
c564611ba7
@ -137,6 +137,18 @@ inline void cuda_mem(void)
|
|||||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||||
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||||
}
|
}
|
||||||
|
#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||||
|
{ \
|
||||||
|
int nt=acceleratorThreads(); \
|
||||||
|
typedef uint64_t Iterator; \
|
||||||
|
auto lambda = [=] accelerator \
|
||||||
|
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||||
|
__VA_ARGS__; \
|
||||||
|
}; \
|
||||||
|
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||||
|
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||||
|
ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||||
|
}
|
||||||
|
|
||||||
#define accelerator_for6dNB(iter1, num1, \
|
#define accelerator_for6dNB(iter1, num1, \
|
||||||
iter2, num2, \
|
iter2, num2, \
|
||||||
@ -157,6 +169,20 @@ inline void cuda_mem(void)
|
|||||||
Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
|
Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||||
|
{ \
|
||||||
|
int nt=acceleratorThreads(); \
|
||||||
|
typedef uint64_t Iterator; \
|
||||||
|
auto lambda = [=] accelerator \
|
||||||
|
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||||
|
__VA_ARGS__; \
|
||||||
|
}; \
|
||||||
|
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||||
|
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||||
|
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||||
|
}
|
||||||
|
|
||||||
template<typename lambda> __global__
|
template<typename lambda> __global__
|
||||||
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
||||||
{
|
{
|
||||||
@ -168,6 +194,17 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
|||||||
Lambda(x,y,z);
|
Lambda(x,y,z);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
template<typename lambda> __global__
|
||||||
|
void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
||||||
|
{
|
||||||
|
// Weird permute is to make lane coalesce for large blocks
|
||||||
|
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
|
||||||
|
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
|
||||||
|
uint64_t z = threadIdx.x;
|
||||||
|
if ( (x < num1) && (y<num2) && (z<num3) ) {
|
||||||
|
Lambda(x,y,z);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<typename lambda> __global__
|
template<typename lambda> __global__
|
||||||
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
|
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
|
||||||
@ -208,6 +245,7 @@ inline void *acceleratorAllocShared(size_t bytes)
|
|||||||
if( err != cudaSuccess ) {
|
if( err != cudaSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
@ -460,6 +498,9 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
|
|||||||
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
|
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
|
||||||
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
|
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
|
||||||
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
|
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
|
||||||
|
#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
|
||||||
|
prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
|
||||||
|
accelerator_barrier(dummy);
|
||||||
|
|
||||||
#define accelerator_for( iter, num, nsimd, ... ) \
|
#define accelerator_for( iter, num, nsimd, ... ) \
|
||||||
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
|
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
|
||||||
|
Loading…
Reference in New Issue
Block a user