Accelerator.h (HIP): skip accelerator_for2dNB when the iteration space is empty

Wrap the whole body of accelerator_for2dNB in `if (num1*num2)` so that no
lambda is built and no kernel is launched when num1*num2 is zero, and replace
the two hipLaunchKernelGGL calls with the triple-chevron launch syntax using
the same launch arguments (hip_blocks, hip_threads, 0, computeStream).
Kernel selection is unchanged: LambdaApply64 when the block holds at most
64 threads, LambdaApply otherwise.

NOTE(review): indentation, backslash alignment and the blank leading/trailing
context lines were reconstructed from a whitespace-collapsed copy of this
patch; apply with `git apply --ignore-whitespace` or regenerate the patch
from the repository if it does not apply cleanly.

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 2c18796d..38e8072d 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -432,22 +432,20 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   { \
-    typedef uint64_t Iterator; \
-    auto lambda = [=] accelerator \
-      (Iterator iter1,Iterator iter2,Iterator lane ) mutable { \
-      { __VA_ARGS__;} \
-    }; \
-    int nt=acceleratorThreads(); \
-    dim3 hip_threads(nsimd, nt, 1); \
-    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
-    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
-      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
-                         0,computeStream, \
-                         num1,num2,nsimd, lambda); \
-    } else { \
-      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
-                         0,computeStream, \
-                         num1,num2,nsimd, lambda); \
+    if (num1*num2) { \
+      typedef uint64_t Iterator; \
+      auto lambda = [=] accelerator \
+        (Iterator iter1,Iterator iter2,Iterator lane ) mutable { \
+        { __VA_ARGS__;} \
+      }; \
+      int nt=acceleratorThreads(); \
+      dim3 hip_threads(nsimd, nt, 1); \
+      dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
+      if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
+        LambdaApply64<<<hip_blocks,hip_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+      } else { \
+        LambdaApply<<<hip_blocks,hip_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
+      } \
     } \
   }
 