From 0b905a72ddfafcf02bcc4b6738ff31c74be79ed5 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 29 Oct 2021 02:22:22 +0100
Subject: [PATCH 1/5] Better reduction for GPUs

---
 Grid/lattice/Lattice_reduction_gpu.h | 48 ++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 6 deletions(-)
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c2875052..823e497e 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 
 template <class Iterator>
-void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
   
   int device;
 #ifdef GRID_CUDA
@@ -37,13 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
   Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  
+  /*  
   std::cout << GridLogDebug << "GPU has:" << std::endl;
   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  
+  */  
   if (warpSize != WARP_SIZE) {
     std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
     exit(EXIT_FAILURE);
@@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   threads = warpSize;
   if ( threads*sizeofsobj > sharedMemPerBlock ) {
     std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
-    exit(EXIT_FAILURE);
+    return 0;
   }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
   // keep all the streaming multiprocessors busy
   blocks = nextPow2(multiProcessorCount);
-  
+  return 1;
 }
 
 template <class sobj, class Iterator>
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer osites) 
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
@@ -208,6 +208,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 
   Integer numThreads, numBlocks;
   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+
   Integer smemSize = numThreads * sizeof(sobj);
 
   Vector<sobj> buffer(numBlocks);
@@ -218,6 +219,41 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   auto result = buffer_v[0];
   return result;
 }
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type  vector;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobj;
+  sobj ret;
+  scalarD *ret_p = (scalarD *)&ret;
+  
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+  Integer numThreads, numBlocks;
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  
+  if ( ok ) {
+    ret = sumD_gpu_internal(lat,osites);
+  } else {
+    std::cout << GridLogWarning << " dropping to summing word by word for large object size "<<sizeof(vobj)<<std::endl;
+    Vector<vector> buffer(osites);
+    vector *dat = (vector *)lat;
+    vector *buf = &buffer[0];
+    iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
+    for(int w=0;w<words;w++) {
+
+      accelerator_for(ss,osites,1,{
+	 buf[ss] = dat[ss*words+w];
+      });
+      
+      ret_p[w] = sumD_gpu_internal(tbuf,osites);
+    }
+  }
+  return ret;
+}
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////

From 42d56ea6b63778ba4013cd0da0d325b5619f23cb Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 29 Oct 2021 02:23:08 +0100
Subject: [PATCH 2/5] Verbosity: drop word-by-word fallback warning in sumD_gpu

---
 Grid/lattice/Lattice_reduction_gpu.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 823e497e..73a704f5 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -238,7 +238,6 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   if ( ok ) {
     ret = sumD_gpu_internal(lat,osites);
   } else {
-    std::cout << GridLogWarning << " dropping to summing word by word for large object size "<<sizeof(vobj)<<std::endl;
     Vector<vector> buffer(osites);
     vector *dat = (vector *)lat;
     vector *buf = &buffer[0];

From 3e882f555dcb75a29e640c6c70fd7e7f23973b73 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 1 Mar 2022 08:54:45 -0500
Subject: [PATCH 3/5] Large / small sumD options

---
 Grid/lattice/Lattice_reduction.h     |  9 +++++
 Grid/lattice/Lattice_reduction_gpu.h | 53 ++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 326b9ea3..c3478ab4 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -142,6 +142,15 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
   return sumD_cpu(arg,osites);
 #endif  
 }
+template<class vobj>
+inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
+{
+#if defined(GRID_CUDA)||defined(GRID_HIP)
+  return sumD_gpu_large(arg,osites);
+#else
+  return sumD_cpu(arg,osites);
+#endif  
+}
 
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 73a704f5..c685a2c0 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer osites) 
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) 
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
@@ -207,7 +207,8 @@ inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer
   Integer size = osites*nsimd;
 
   Integer numThreads, numBlocks;
-  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  assert(ok);
 
   Integer smemSize = numThreads * sizeof(sobj);
 
@@ -219,6 +220,37 @@ inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer
   auto result = buffer_v[0];
   return result;
 }
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type  vector;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobj;
+  sobj ret;
+  scalarD *ret_p = (scalarD *)&ret;
+  
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+  Integer numThreads, numBlocks;
+
+  Vector<vector> buffer(osites);
+  vector *dat = (vector *)lat;
+  vector *buf = &buffer[0];
+  iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
+  for(int w=0;w<words;w++) {
+
+    accelerator_for(ss,osites,1,{
+	buf[ss] = dat[ss*words+w];
+      });
+      
+    ret_p[w] = sumD_gpu_small(tbuf,osites);
+  }
+  return ret;
+}
+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
@@ -236,23 +268,14 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
   
   if ( ok ) {
-    ret = sumD_gpu_internal(lat,osites);
+    ret = sumD_gpu_small(lat,osites);
   } else {
-    Vector<vector> buffer(osites);
-    vector *dat = (vector *)lat;
-    vector *buf = &buffer[0];
-    iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];
-    for(int w=0;w<words;w++) {
-
-      accelerator_for(ss,osites,1,{
-	 buf[ss] = dat[ss*words+w];
-      });
-      
-      ret_p[w] = sumD_gpu_internal(tbuf,osites);
-    }
+    ret = sumD_gpu_large(lat,osites);
   }
   return ret;
 }
+
+
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////

From d4ae71b8806e535145a718a9e1e00a7315aaf5bc Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Wed, 2 Mar 2022 15:40:18 +0000
Subject: [PATCH 4/5] sum_large and sum_gpu_large templates added.

---
 Grid/lattice/Lattice_reduction.h     | 16 ++++++++++++++++
 Grid/lattice/Lattice_reduction_gpu.h |  8 ++++++++
 2 files changed, 24 insertions(+)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index c3478ab4..0ddac437 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -168,6 +168,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   return ssum;
 }
 
+template<class vobj>
+inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+{
+#if defined(GRID_CUDA)||defined(GRID_HIP)
+  autoView( arg_v, arg, AcceleratorRead);
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum_gpu_large(&arg_v[0],osites);
+#else
+  autoView(arg_v, arg, CpuRead);
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum_cpu(&arg_v[0],osites);
+#endif
+  arg.Grid()->GlobalSum(ssum);
+  return ssum;
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c685a2c0..c3422af3 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -288,6 +288,14 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
   return result;
 }
 
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu_large(lat,osites);
+  return result;
+}
 
 
 NAMESPACE_END(Grid);

From d1decee4cce0b9a9f02bb5521cb06840b20022ad Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Wed, 2 Mar 2022 16:54:23 +0000
Subject: [PATCH 5/5] Cleaned up unused variables in Lattice_reduction_gpu.h

---
 Grid/lattice/Lattice_reduction_gpu.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c3422af3..bad86d2a 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -232,10 +232,6 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
   
   const int words = sizeof(vobj)/sizeof(vector);
 
-  Integer nsimd= vobj::Nsimd();
-  Integer size = osites*nsimd;
-  Integer numThreads, numBlocks;
-
   Vector<vector> buffer(osites);
   vector *dat = (vector *)lat;
   vector *buf = &buffer[0];
@@ -258,10 +254,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   typedef typename vobj::scalar_typeD scalarD;
   typedef typename vobj::scalar_objectD sobj;
   sobj ret;
-  scalarD *ret_p = (scalarD *)&ret;
   
-  const int words = sizeof(vobj)/sizeof(vector);
-
   Integer nsimd= vobj::Nsimd();
   Integer size = osites*nsimd;
   Integer numThreads, numBlocks;
@@ -275,7 +268,6 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   return ret;
 }
 
-
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -297,5 +289,4 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
   return result;
 }
 
-
 NAMESPACE_END(Grid);