1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

10TF/s on 32^3 x 64 on single node

This commit is contained in:
Peter Boyle 2022-08-04 15:43:52 -04:00
parent 2cb5bedc15
commit 95b640cb6b

View File

@ -107,7 +107,7 @@ void acceleratorInit(void);
extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream;
extern cudaStream_t cpuStream;
extern cudaStream_t computeStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@ -135,7 +135,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
#define accelerator_for6dNB(iter1, num1, \
@ -154,7 +154,7 @@ inline void cuda_mem(void)
}; \
dim3 cu_blocks (num1,num2,num3); \
dim3 cu_threads(num4,num5,num6); \
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
}
template<typename lambda> __global__
@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
#define accelerator_barrier(dummy) \
{ \
cudaDeviceSynchronize(); \
cudaStreamSynchronize(computeStream); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \
@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid);
#define accelerator_inline __host__ __device__ inline
extern hipStream_t copyStream;
extern hipStream_t cpuStream;
extern hipStream_t computeStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
@ -362,16 +362,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
0,0/*cpuStream*/, \
0,computeStream, \
num1,num2,nsimd, lambda); \
} else { \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0/*cpuStream*/, \
0,computeStream, \
num1,num2,nsimd, lambda); \
} \
}
// Works with MPI if barrier here
// accelerator_barrier();
template<typename lambda> __global__
__launch_bounds__(64,1)
@ -401,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
#define accelerator_barrier(dummy) \
{ \
hipDeviceSynchronize(); \
hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \