From 3e49dc8a671b6b6eeff8d63513876a9e01b6f6ca Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 14 Aug 2019 15:18:34 +0100
Subject: [PATCH] Reduction finished and hopefully fixes CI regression failure
 on single precision and force

---
 Grid/lattice/Lattice_reduction.h     | 123 ++++++++++++++++++---------
 Grid/lattice/Lattice_reduction_gpu.h |  69 ++++++++-------
 2 files changed, 119 insertions(+), 73 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index a5366435..c8f4e7b9 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -29,55 +29,58 @@ Author: paboyle
 #endif
 
 NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////
+// FIXME this should promote to double and accumulate
+//////////////////////////////////////////////////////
 template<class vobj>
-inline typename vobj::scalar_object sum_cpu(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 {
-  GridBase *grid=arg.Grid();
-  int Nsimd = grid->Nsimd();
-
-  Vector<vobj> sumarray(grid->SumArraySize());
-  for(int i=0;i<grid->SumArraySize();i++){
+  typedef typename vobj::scalar_object sobj;
+
+  const int Nsimd   = vobj::Nsimd();
+  const int nthread = GridThread::GetThreads();
+
+  Vector<sobj> sumarray(nthread);
+  for(int i=0;i<nthread;i++){
     sumarray[i]=Zero();
   }
 
-  thread_for(thr,grid->SumArraySize(), {
+  thread_for(thr,nthread, {
     int nwork, mywork, myoff;
-    nwork = grid->oSites();
+    nwork = osites;
     GridThread::GetWork(nwork,thr,mywork,myoff);
-    vobj vvsum=Zero();
+
+    vobj vvsum=Zero();
     for(int ss=myoff;ss<mywork+myoff; ss++){
       vvsum = vvsum + arg[ss];
     }
-    sumarray[thr]=vvsum;
+    sumarray[thr]=Reduce(vvsum);
   });
 
-  vobj vsum=Zero();  // sum across threads
-  for(int i=0;i<grid->SumArraySize();i++){
-    vsum = vsum+sumarray[i];
+  sobj ssum=Zero();  // sum across threads
+  for(int i=0;i<nthread;i++){
+    ssum = ssum+sumarray[i];
   }
-  typedef typename vobj::scalar_object sobj;
-  sobj ssum=Zero();
-
-  ExtractBuffer<sobj> buf(Nsimd);
-  extract(vsum,buf);
-
-  for(int i=0;i<Nsimd;i++){
-    ssum = ssum + buf[i];
-  }
-  arg.Grid()->GlobalSum(ssum);
+
   return ssum;
 }
-
+template<class vobj>
+inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
+{
+#ifdef GRID_NVCC
+  return sum_gpu(arg,osites);
+#else
+  return sum_cpu(arg,osites);
+#endif
+}
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-#ifdef GRID_NVCC
-  return sum_gpu(arg);
-#else
-  return sum_cpu(arg);
-#endif
+  auto arg_v = arg.View();
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum(&arg_v[0],osites);
+  arg.Grid()->GlobalSum(ssum);
+  return ssum;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -94,7 +97,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 {
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
-  scalar_type nrm;
+  ComplexD nrm;
 
   GridBase *grid = left.Grid();
 
@@ -104,20 +107,43 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
+
+  typename vobj::scalar_object f;
+  typename vobj::scalar_objectD d;
+  f=Zero();
+  d=f;
+#ifdef GRID_NVCC
+  // GPU - SIMT lane compliance...
   typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
-  Lattice<inner_t> inner_tmp(grid);
-  auto inner_tmp_v = inner_tmp.View();
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
   accelerator_for( ss, sites, nsimd,{
     auto x_l = left_v(ss);
    auto y_l = right_v(ss);
     coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
   })
-  nrm = TensorRemove(sum(inner_tmp));
+  // This is in single precision and fails some tests
+  // Need a sumD that sums in double
+  nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
+#else
+  // CPU
+  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+    auto x_l = left_v[ss];
+    auto y_l = right_v[ss];
+    inner_tmp_v[ss]=innerProductD(x_l,y_l);
+  })
+  nrm = TensorRemove(sum(inner_tmp_v,sites));
+#endif
+  grid->GlobalSum(nrm);
 
-  // right.Grid()->GlobalSum(nrm);
   return nrm;
 }
 
@@ -153,19 +179,34 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
 
+#ifdef GRID_NVCC
+  // GPU
   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
-  Lattice<inner_t> inner_tmp(grid);
-  auto inner_tmp_v = inner_tmp.View();
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
 
   accelerator_for( ss, sites, nsimd,{
     auto tmp = a*x_v(ss)+b*y_v(ss);
     coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
     coalescedWrite(z_v[ss],tmp);
-  })
-
-  nrm = real(TensorRemove(sum(inner_tmp)));
+  });
 
-  // z.Grid()->GlobalSum(nrm);
+  nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
+#else
+  // CPU
+  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+    auto tmp = a*x_v(ss)+b*y_v(ss);
+    inner_tmp_v[ss]=innerProductD(tmp,tmp);
+    z_v[ss]=tmp;
+  });
+  // Already promoted to double
+  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
+#endif
+  grid->GlobalSum(nrm);
   return nrm;
 }
 
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index d2906492..c5d75356 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -82,13 +82,11 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
   __syncthreads();
 }
 
-template <class vobj, class Iterator>
-__device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::scalar_object *g_odata, Iterator n) {
-
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_object sobj;
-  constexpr Iterator nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+template <class vobj, class sobj, class Iterator>
+__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
+{
+  constexpr Iterator nsimd = vobj::Nsimd();
 
   Iterator blockSize = blockDim.x;
 
@@ -109,13 +107,16 @@ __device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::sca
     Iterator lane = i % nsimd;
     Iterator ss   = i / nsimd;
     auto tmp = extractLane(lane,g_idata[ss]);
-    mySum += tmp;
+    sobj tmpD;
+    tmpD=tmp;
+    mySum +=tmpD;
 
     if (i + blockSize < n) {
       lane = (i+blockSize) % nsimd;
       ss   = (i+blockSize) / nsimd;
       tmp = extractLane(lane,g_idata[ss]);
-      mySum += tmp;
+      tmpD = tmp;
+      mySum += tmpD;
     }
     i += gridSize;
   }
@@ -126,9 +127,8 @@
   if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 }
 
-template <class vobj, class Iterator>
-__global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_object *buffer, Iterator n) {
-  typedef typename vobj::scalar_object sobj;
+template <class vobj, class sobj, class Iterator>
+__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 
   Iterator blockSize = blockDim.x;
 
@@ -179,32 +179,26 @@ __global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_
   }
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Possibly promote to double and sum
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_object sum_gpu(const Lattice<vobj> &lat)
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
+  typedef typename vobj::scalar_objectD sobj;
+  typedef decltype(lat) Iterator;
 
-  LatticeView<vobj> lat_v = lat.View();
-
-  typedef typename vobj::scalar_object sobj;
-  typedef decltype(lat_v.begin()) Iterator;
-
-  Iterator size = lat.Grid()->lSites();
-
-  Iterator numThreads, numBlocks;
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+
+  Integer numThreads, numBlocks;
   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  Iterator smemSize = numThreads * sizeof(sobj);
-  /*
-  std::cout << GridLogDebug << "Starting reduction with:" << std::endl;
-  std::cout << GridLogDebug << "\tsize = " << size << std::endl;
-  std::cout << GridLogDebug << "\tnumThreads = " << numThreads << std::endl;
-  std::cout << GridLogDebug << "\tnumBlocks = " << numBlocks << std::endl;
-  std::cout << GridLogDebug << "\tsmemSize = " << smemSize << std::endl;
-  */
+  Integer smemSize = numThreads * sizeof(sobj);
 
   Vector<sobj> buffer(numBlocks);
   sobj *buffer_v = &buffer[0];
 
-  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat_v, buffer_v, size);
+  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
   cudaDeviceSynchronize();
 
   cudaError err = cudaGetLastError();
@@ -212,10 +206,21 @@ inline typename vobj::scalar_object sum_gpu(const Lattice<vobj> &lat)
     printf("Cuda error %s\n",cudaGetErrorString( err ));
     exit(0);
   }
-
   auto result = buffer_v[0];
-  lat.Grid()->GlobalSum(result);
+  return result;
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Return as same precision as input performing reduction in double precision though
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu(lat,osites);
   return result;
 }
+
+
 NAMESPACE_END(Grid);
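
The numerical point behind the patch, spelled out: the old paths reduced a single-precision lattice with a single-precision accumulator, so once the running total is large enough, each new O(1) term falls below one ULP of the total and is rounded away. That is how a reduction over a large volume comes out measurably wrong and trips the single-precision CI test. A minimal standalone sketch (plain C++, independent of Grid's types) showing the failure mode and the fix:

#include <cstdio>
#include <vector>

int main() {
  const long N = 1L << 25;   // 33,554,432 terms; a modest lattice-volume scale
  std::vector<float> x(N, 1.0f);

  float  accF = 0.0f;        // single-precision accumulator (the old behaviour)
  double accD = 0.0;         // double accumulator (what innerProductD/sumD_gpu do)
  for (long i = 0; i < N; i++) {
    accF += x[i];            // stalls at 2^24: adding 1.0f can no longer change it
    accD += (double)x[i];    // promote each term before accumulating
  }
  std::printf("float  accumulator = %.1f\n", (double)accF); // 16777216.0
  std::printf("double accumulator = %.1f\n", accD);         // 33554432.0
  return 0;
}

Half the sum is simply lost in the float accumulator. Promoting to double before accumulating, as the innerProductD/sumD_gpu paths now do, keeps every term; the result is rounded back to the input precision only once, at the very end.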
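Structurally, the change splits every reduction into two layers: a node-local, pointer-level routine taking (const vobj *, Integer osites) that accumulates in double (sumD_gpu, and the innerProductD path on the CPU side), plus a thin wrapper that rounds back to the input precision (sum_gpu) or, at the Lattice level, performs the single GlobalSum. A simplified CPU-only sketch of that layering (sumD_local and sum_local are illustrative names, not Grid identifiers):

#include <cstdio>
#include <cstddef>

// Node-local reduction that accumulates in double whatever the input
// precision T is (cf. sumD_gpu in the patch).
template <class T>
double sumD_local(const T *arg, std::size_t n) {
  double acc = 0.0;
  for (std::size_t i = 0; i < n; i++) acc += double(arg[i]);
  return acc;
}

// Wrapper returning the input precision; the narrowing happens exactly once,
// on the finished sum (cf. sum_gpu). Neither routine knows about MPI.
template <class T>
T sum_local(const T *arg, std::size_t n) {
  return T(sumD_local(arg, n));
}

int main() {
  float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  std::printf("%f\n", (double)sum_local(x, 4)); // 10.000000
  return 0;
}

Keeping GlobalSum out of the pointer-level routines is what lets innerProduct and axpby_norm_fast run them on scratch Vector<inner_t> buffers that have no grid attached; each caller then does one GlobalSum on the final scalar.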
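On the GPU side the same promotion happens per SIMD lane: reduceBlocks extracts each lane in the source precision, copies it into the double-precision scalar object tmpD, and only then folds it into mySum, so every per-thread partial sum is already double. A serial sketch of that lane indexing and promotion, with a hypothetical stand-in vector type in place of Grid's vobj and extractLane:

#include <cstddef>

// Stand-in for a vectorised object: Nsimd single-precision lanes.
template <int Nsimd>
struct VecF { float lane[Nsimd]; };

// Stand-in for Grid's extractLane: pull one SIMD lane out of a vector object.
template <int Nsimd>
float extractLane(int lane, const VecF<Nsimd> &v) { return v.lane[lane]; }

// Serial analogue of the reduceBlocks indexing: global lane i lives in
// vector object ss = i/Nsimd at lane i%Nsimd, and the extracted value is
// widened to double *before* it touches the accumulator.
template <int Nsimd>
double reduce_promoted(const VecF<Nsimd> *g_idata, std::size_t osites) {
  double mySum = 0.0;
  for (std::size_t i = 0; i < osites * Nsimd; i++) {
    std::size_t ss = i / Nsimd;
    int lane = (int)(i % Nsimd);
    double tmpD = extractLane(lane, g_idata[ss]); // widen, then accumulate
    mySum += tmpD;
  }
  return mySum;
}

int main() {
  VecF<4> v[2] = {{{1, 2, 3, 4}}, {{5, 6, 7, 8}}};
  return reduce_promoted(v, 2) == 36.0 ? 0 : 1; // all 8 lanes: 36
}

This walk over scalar lanes is also why sumD_gpu now computes size as osites*nsimd: the pointer-level routine has no Grid to ask for lSites(), but the lane count of the flat vobj array it is handed is the same number.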