diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 5f5c6cc0..4bdcce0b 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -211,13 +211,28 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi assert(ok); Integer smemSize = numThreads * sizeof(sobj); - - Vector buffer(numBlocks); + // UVM seems to be buggy under later CUDA drivers + // This fails on A100 and driver 5.30.02 / CUDA 12.1 + // Fails with multiple NVCC versions back to 11.4, + // which worked with earlier drivers. + // Not sure which driver had first fail and this bears checking + // Is awkward as must install multiple driver versions +#undef UVM_BLOCK_BUFFER +#ifndef UVM_BLOCK_BUFFER + commVector buffer(numBlocks); sobj *buffer_v = &buffer[0]; - + sobj result; reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); accelerator_barrier(); - auto result = buffer_v[0]; + acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); +#else + Vector buffer(numBlocks); + sobj *buffer_v = &buffer[0]; + sobj result; + reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); + accelerator_barrier(); + result = *buffer_v; +#endif return result; }