Boost precision in inner products in single

2025-07-25 08:57:08 +01:00 · 2020-06-24 12:52:31 -04:00
parent 093d1ee21b
commit 22cfbdbbb3
1 changed files with 25 additions and 25 deletions
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -93,7 +93,9 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
    ssum = ssum+sumarray[i];
  } 
  
-  return ssum;
+  typedef typename vobj::scalar_object ssobj;
+  ssobj ret = ssum;
+  return ret;
 }


@@ -154,7 +156,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  
  // Might make all code paths go this way.
-  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
+  typedef decltype(Reduce(innerProductD(vobj(),vobj()))) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
    
@@ -163,16 +165,15 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
    autoView( right_v,right, AcceleratorRead);

    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, nsimd,{
-	auto x_l = left_v(ss);
-	auto y_l = right_v(ss);
-	coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
-      })
+    accelerator_for( ss, sites, 1,{
+	auto x_l = left_v[ss];
+	auto y_l = right_v[ss];
+	inner_tmp_v[ss]=Reduce(innerProductD(x_l,y_l));
+    });
  }

  // This is in single precision and fails some tests
-  // Need a sumD that sums in double
-  nrm = TensorRemove(sumD(inner_tmp_v,sites));  
+  nrm = TensorRemove(sum(inner_tmp_v,sites));  
  return nrm;
 }

@@ -218,16 +219,16 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);

-  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
+  typedef decltype(Reduce(innerProductD(x_v[0],y_v[0]))) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];

-  accelerator_for( ss, sites, nsimd,{
-      auto tmp = a*x_v(ss)+b*y_v(ss);
-      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
-      coalescedWrite(z_v[ss],tmp);
+  accelerator_for( ss, sites, 1,{
+      auto tmp = a*x_v[ss]+b*y_v[ss];
+      inner_tmp_v[ss]=Reduce(innerProductD(tmp,tmp));
+      z_v[ss]=tmp;
  });
-  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
+  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
  grid->GlobalSum(nrm);
  return nrm; 
 }
@@ -243,29 +244,28 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti

  GridBase *grid = left.Grid();

-
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();

  // GPU
-  typedef decltype(innerProduct(vobj(),vobj())) inner_t;
-  typedef decltype(innerProduct(vobj(),vobj())) norm_t;
+  typedef decltype(Reduce(innerProductD(vobj(),vobj()))) inner_t;
+  typedef decltype(Reduce(innerProductD(vobj(),vobj()))) norm_t;
  Vector<inner_t> inner_tmp(sites);
-  Vector<norm_t> norm_tmp(sites);
+  Vector<norm_t>  norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
  {
    autoView(left_v,left, AcceleratorRead);
    autoView(right_v,right,AcceleratorRead);
-    accelerator_for( ss, sites, nsimd,{
-	auto left_tmp = left_v(ss);
-	coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
-	coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
+    accelerator_for( ss, sites, 1,{
+	auto left_tmp = left_v[ss];
+	inner_tmp_v[ss]=Reduce(innerProductD(left_tmp,right_v[ss]));
+        norm_tmp_v [ss]=Reduce(innerProductD(left_tmp,left_tmp));
      });
  }

-  tmp[0] = TensorRemove(sumD(inner_tmp_v,sites));
-  tmp[1] = TensorRemove(sumD(norm_tmp_v,sites));
+  tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
+  tmp[1] = TensorRemove(sum(norm_tmp_v,sites));

  grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
  ip = tmp[0];