From d0bb033ea2990065ef0b1afdf54dc3d5014b6110 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 22 Mar 2023 19:07:32 -0400
Subject: [PATCH] Device resident GPU block buffer instead of UVM as hit likely
 UVM bug. Code worked on CUDA 11.4 but fails on later drivers (certainly
 530.30.02, but need to find the perlmutter driver version).

---
 Grid/lattice/Lattice_reduction_gpu.h | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 5f5c6cc0..4bdcce0b 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -211,13 +211,28 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
   assert(ok);
 
   Integer smemSize = numThreads * sizeof(sobj);
-
-  Vector<sobj> buffer(numBlocks);
+  // UVM seems to be buggy under later CUDA drivers
+  // This fails on A100 and driver 5.30.02 / CUDA 12.1
+  // Fails with multiple NVCC versions back to 11.4,
+  // which worked with earlier drivers.
+  // Not sure which driver had first fail and this bears checking
+  // Is awkward as must install multiple driver versions
+#undef UVM_BLOCK_BUFFER  
+#ifndef UVM_BLOCK_BUFFER  
+  commVector<sobj> buffer(numBlocks);
   sobj *buffer_v = &buffer[0];
-  
+  sobj result;
   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
   accelerator_barrier();
-  auto result = buffer_v[0];
+  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
+#else
+  Vector<sobj> buffer(numBlocks);
+  sobj *buffer_v = &buffer[0];
+  sobj result;
+  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
+  accelerator_barrier();
+  result = *buffer_v;
+#endif
   return result;
 }