1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

10TF/s on 32^3 x 64 on single node

This commit is contained in:
Peter Boyle 2022-08-04 15:43:52 -04:00
parent 2cb5bedc15
commit 95b640cb6b

View File

@ -107,7 +107,7 @@ void acceleratorInit(void);
extern int acceleratorAbortOnGpuError; extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream; extern cudaStream_t copyStream;
extern cudaStream_t cpuStream; extern cudaStream_t computeStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) { accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT #ifdef GRID_SIMT
@ -135,7 +135,7 @@ inline void cuda_mem(void)
}; \ }; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \ dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \ LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
} }
#define accelerator_for6dNB(iter1, num1, \ #define accelerator_for6dNB(iter1, num1, \
@ -154,7 +154,7 @@ inline void cuda_mem(void)
}; \ }; \
dim3 cu_blocks (num1,num2,num3); \ dim3 cu_blocks (num1,num2,num3); \
dim3 cu_threads(num4,num5,num6); \ dim3 cu_threads(num4,num5,num6); \
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \ Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
} }
template<typename lambda> __global__ template<typename lambda> __global__
@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
#define accelerator_barrier(dummy) \ #define accelerator_barrier(dummy) \
{ \ { \
cudaDeviceSynchronize(); \ cudaStreamSynchronize(computeStream); \
cudaError err = cudaGetLastError(); \ cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \ if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \ printf("accelerator_barrier(): Cuda error %s \n", \
@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid);
#define accelerator_inline __host__ __device__ inline #define accelerator_inline __host__ __device__ inline
extern hipStream_t copyStream; extern hipStream_t copyStream;
extern hipStream_t cpuStream; extern hipStream_t computeStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */ /*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) { accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT #ifdef GRID_SIMT
@ -362,16 +362,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \ if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \ hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
0,0/*cpuStream*/, \ 0,computeStream, \
num1,num2,nsimd, lambda); \ num1,num2,nsimd, lambda); \
} else { \ } else { \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0/*cpuStream*/, \ 0,computeStream, \
num1,num2,nsimd, lambda); \ num1,num2,nsimd, lambda); \
} \ } \
} }
// Works with MPI if barrier here
// accelerator_barrier();
template<typename lambda> __global__ template<typename lambda> __global__
__launch_bounds__(64,1) __launch_bounds__(64,1)
@ -401,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
#define accelerator_barrier(dummy) \ #define accelerator_barrier(dummy) \
{ \ { \
hipDeviceSynchronize(); \ hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \ auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \ if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \