From 95b640cb6ba6b3211554f799d2ede5744ae37403 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Aug 2022 15:43:52 -0400 Subject: [PATCH] 10TF/s on 32^3 x 64 on single node --- Grid/threads/Accelerator.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index bd09a880..4e476abb 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -107,7 +107,7 @@ void acceleratorInit(void); extern int acceleratorAbortOnGpuError; extern cudaStream_t copyStream; -extern cudaStream_t cpuStream; +extern cudaStream_t computeStream; accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -135,7 +135,7 @@ inline void cuda_mem(void) }; \ dim3 cu_threads(nsimd,acceleratorThreads(),1); \ dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - LambdaApply<<>>(num1,num2,nsimd,lambda); \ + LambdaApply<<>>(num1,num2,nsimd,lambda); \ } #define accelerator_for6dNB(iter1, num1, \ @@ -154,7 +154,7 @@ inline void cuda_mem(void) }; \ dim3 cu_blocks (num1,num2,num3); \ dim3 cu_threads(num4,num5,num6); \ - Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ + Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ } template __global__ @@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, #define accelerator_barrier(dummy) \ { \ - cudaDeviceSynchronize(); \ + cudaStreamSynchronize(computeStream); \ cudaError err = cudaGetLastError(); \ if ( cudaSuccess != err ) { \ printf("accelerator_barrier(): Cuda error %s \n", \ @@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid); #define accelerator_inline __host__ __device__ inline extern hipStream_t copyStream; -extern hipStream_t cpuStream; +extern hipStream_t computeStream; /*These routines define mapping from thread grid to loop & vector lane indexing */ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -362,16 +362,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \ hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \ - 0,0/*cpuStream*/, \ + 0,computeStream, \ num1,num2,nsimd, lambda); \ } else { \ hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ - 0,0/*cpuStream*/, \ + 0,computeStream, \ num1,num2,nsimd, lambda); \ } \ } -// Works with MPI if barrier here -// accelerator_barrier(); template __global__ __launch_bounds__(64,1) @@ -401,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) #define accelerator_barrier(dummy) \ { \ - hipDeviceSynchronize(); \ + hipStreamSynchronize(computeStream); \ auto err = hipGetLastError(); \ if ( err != hipSuccess ) { \ printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \