From d0bb033ea2990065ef0b1afdf54dc3d5014b6110 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 22 Mar 2023 19:07:32 -0400 Subject: [PATCH] Device resident GPU block buffer instead of UVM as hit likely UVM bug. Code worked on CUDA 11.4 but fails on later drivers (certainly 530.30.02, but need to find the perlmutter driver version). --- Grid/lattice/Lattice_reduction_gpu.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 5f5c6cc0..4bdcce0b 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -211,13 +211,28 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi assert(ok); Integer smemSize = numThreads * sizeof(sobj); - - Vector buffer(numBlocks); + // UVM seems to be buggy under later CUDA drivers + // This fails on A100 and driver 5.30.02 / CUDA 12.1 + // Fails with multiple NVCC versions back to 11.4, + // which worked with earlier drivers. + // Not sure which driver had first fail and this bears checking + // Is awkward as must install multiple driver versions +#undef UVM_BLOCK_BUFFER +#ifndef UVM_BLOCK_BUFFER + commVector buffer(numBlocks); sobj *buffer_v = &buffer[0]; - + sobj result; reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); accelerator_barrier(); - auto result = buffer_v[0]; + acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); +#else + Vector buffer(numBlocks); + sobj *buffer_v = &buffer[0]; + sobj result; + reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); + accelerator_barrier(); + result = *buffer_v; +#endif return result; }