mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
10TF/s on 32^3 x 64 on single node
This commit is contained in:
parent
2cb5bedc15
commit
95b640cb6b
@ -107,7 +107,7 @@ void acceleratorInit(void);
|
||||
|
||||
extern int acceleratorAbortOnGpuError;
|
||||
extern cudaStream_t copyStream;
|
||||
extern cudaStream_t cpuStream;
|
||||
extern cudaStream_t computeStream;
|
||||
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#ifdef GRID_SIMT
|
||||
@ -135,7 +135,7 @@ inline void cuda_mem(void)
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \
|
||||
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
}
|
||||
|
||||
#define accelerator_for6dNB(iter1, num1, \
|
||||
@ -154,7 +154,7 @@ inline void cuda_mem(void)
|
||||
}; \
|
||||
dim3 cu_blocks (num1,num2,num3); \
|
||||
dim3 cu_threads(num4,num5,num6); \
|
||||
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
|
||||
Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
|
||||
}
|
||||
|
||||
template<typename lambda> __global__
|
||||
@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
|
||||
|
||||
#define accelerator_barrier(dummy) \
|
||||
{ \
|
||||
cudaDeviceSynchronize(); \
|
||||
cudaStreamSynchronize(computeStream); \
|
||||
cudaError err = cudaGetLastError(); \
|
||||
if ( cudaSuccess != err ) { \
|
||||
printf("accelerator_barrier(): Cuda error %s \n", \
|
||||
@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
#define accelerator_inline __host__ __device__ inline
|
||||
|
||||
extern hipStream_t copyStream;
|
||||
extern hipStream_t cpuStream;
|
||||
extern hipStream_t computeStream;
|
||||
/* These routines define the mapping from the thread grid to loop and vector-lane indexing */
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#ifdef GRID_SIMT
|
||||
@ -362,16 +362,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
|
||||
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
|
||||
0,0/*cpuStream*/, \
|
||||
0,computeStream, \
|
||||
num1,num2,nsimd, lambda); \
|
||||
} else { \
|
||||
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
|
||||
0,0/*cpuStream*/, \
|
||||
0,computeStream, \
|
||||
num1,num2,nsimd, lambda); \
|
||||
} \
|
||||
}
|
||||
// Works with MPI if a barrier is placed here
|
||||
// accelerator_barrier();
|
||||
|
||||
template<typename lambda> __global__
|
||||
__launch_bounds__(64,1)
|
||||
@ -401,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
||||
|
||||
#define accelerator_barrier(dummy) \
|
||||
{ \
|
||||
hipDeviceSynchronize(); \
|
||||
hipStreamSynchronize(computeStream); \
|
||||
auto err = hipGetLastError(); \
|
||||
if ( err != hipSuccess ) { \
|
||||
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
|
||||
|
Loading…
x
Reference in New Issue
Block a user