From 96ac56cacea12c5949d79e2d5aee43c250420d07 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 14 Aug 2019 13:08:01 +0100
Subject: [PATCH] Double precision variants for summation accuracy

---
 Grid/tensors/Tensor_class.h  |  18 +++++-
 Grid/tensors/Tensor_inner.h  | 106 +++++++++++++++++++++++++++++++++--
 Grid/tensors/Tensor_traits.h |  28 +++++++++
 3 files changed, 144 insertions(+), 8 deletions(-)
diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h
index 2b578387..75e42721 100644
--- a/Grid/tensors/Tensor_class.h
+++ b/Grid/tensors/Tensor_class.h
@@ -47,9 +47,11 @@ class GridTensorBase {};
   using element = vtype; \
   using scalar_type     = typename Traits::scalar_type; \
   using vector_type     = typename Traits::vector_type; \
+  using scalar_typeD    = typename Traits::scalar_typeD; \
   using vector_typeD    = typename Traits::vector_typeD; \
   using tensor_reduced  = typename Traits::tensor_reduced; \
   using scalar_object   = typename Traits::scalar_object; \
+  using scalar_objectD  = typename Traits::scalar_objectD; \
   using Complexified    = typename Traits::Complexified; \
   using Realified       = typename Traits::Realified; \
   using DoublePrecision = typename Traits::DoublePrecision; \
@@ -149,7 +151,7 @@ public:
 
   // Convert elements
   template <class ttype>
-  accelerator_inline iScalar<vtype> operator=(iScalar<ttype> &&arg) {
+  accelerator_inline iScalar<vtype> operator=(const iScalar<ttype> &arg) {
     _internal = arg._internal;
     return *this;
   }
@@ -200,6 +202,14 @@ public:
   accelerator iVector() = default;
   accelerator_inline iVector(const Zero &z) { zeroit(*this); };
 
+  template<class other>
+  accelerator_inline iVector<vtype, N> &operator=(const iVector<other,N> &him)
+  {
+    for (int i = 0; i < N; i++) {
+      _internal[i] = him._internal[i];
+    }    
+    return *this;
+  } 
   accelerator_inline iVector<vtype, N> &operator=(const Zero &hero) {
     zeroit(*this);
     return *this;
@@ -296,10 +306,12 @@ public:
   accelerator_inline iMatrix(const Zero &z) { zeroit(*this); };
   accelerator iMatrix() = default;
 
-  accelerator_inline iMatrix &operator=(const iMatrix &rhs) {
+  // Allow for type conversion.
+  template<class other>
+  accelerator_inline iMatrix &operator=(const iMatrix<other,N> &rhs) {
     for (int i = 0; i < N; i++)
       for (int j = 0; j < N; j++) 
-	vstream(_internal[i][j], rhs._internal[i][j]);
+	_internal[i][j] = rhs._internal[i][j];
     return *this;
   };
 
diff --git a/Grid/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h
index ca35e851..03f72966 100644
--- a/Grid/tensors/Tensor_inner.h
+++ b/Grid/tensors/Tensor_inner.h
@@ -41,10 +41,103 @@ template<class sobj> accelerator_inline RealD norm2(const sobj &arg){
   RealD ret = real(nrm);
   return ret;
 }
-//////////////////////////////////////
-// If single promote to double and sum 2x
-//////////////////////////////////////
 
+////////////////////////////////////////////////////////////////
+// FIXME horizontal sum: single promote to double and sum 
+// Not yet used; make sum_cpu in LatticeReduction.h call these
+// GPU sum_gpu is using a double precision promotion.
+////////////////////////////////////////////////////////////////
+#if 0
+accelerator_inline ComplexD ReduceD(const ComplexF &l) { return l; };
+accelerator_inline ComplexD ReduceD(const ComplexD &l) { return l; };
+accelerator_inline RealD    ReduceD(const RealD    &l) { return l; };
+accelerator_inline RealD    ReduceD(const RealF    &l) { return l; };
+
+accelerator_inline ComplexD ReduceD(const vComplexD &l) { return Reduce(l); };
+accelerator_inline RealD    ReduceD(const vRealD    &l) { return Reduce(l); };
+accelerator_inline ComplexD ReduceD(const vComplexF  &l) 
+{ 
+  vComplexD la,lb;
+  Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
+  return Reduce(la)+Reduce(lb); 
+}
+accelerator_inline RealD    ReduceD(const vRealF    &l) 
+{
+  vRealD la,lb;
+  Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
+  return Reduce(la)+Reduce(lb); 
+};
+///////////////////////////////////////////////////////
+// Now do it for vector, matrix, scalar
+///////////////////////////////////////////////////////
+template<class l,int N> accelerator_inline
+auto ReduceD (const iVector<l,N>& lhs) -> iVector<decltype(ReduceD(lhs._internal[0])),N>
+{
+  typedef decltype(ReduceD(lhs._internal[0])) ret_t;
+  iVector<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    ret._internal[c1] = ReduceD(lhs._internal[c1]);
+  }
+  return ret;
+}
+template<class l,int N> accelerator_inline
+auto ReduceD (const iMatrix<l,N>& lhs) -> iMatrix<decltype(ReduceD(lhs._internal[0][0])),N>
+{
+  typedef decltype(ReduceD(lhs._internal[0][0])) ret_t;
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2]=ReduceD(lhs._internal[c1][c2]);
+  }}
+  return ret;
+}
+template<class l> accelerator_inline
+auto ReduceD (const iScalar<l>& lhs) -> iScalar<decltype(ReduceD(lhs._internal))>
+{
+  typedef decltype(ReduceD(lhs._internal)) ret_t;
+  iScalar<ret_t> ret;
+  ret._internal = ReduceD(lhs._internal);
+  return ret;
+}
+#endif
+///////////////////////////////////////////////////////
+// Now do Reduce for vector, matrix, scalar
+///////////////////////////////////////////////////////
+template<class l,int N> accelerator_inline
+auto Reduce (const iVector<l,N>& lhs) -> iVector<decltype(Reduce(lhs._internal[0])),N>
+{
+  typedef decltype(Reduce(lhs._internal[0])) ret_t;
+  iVector<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    ret._internal[c1] = Reduce(lhs._internal[c1]);
+  }
+  return ret;
+}
+template<class l,int N> accelerator_inline
+auto Reduce (const iMatrix<l,N>& lhs) -> iMatrix<decltype(Reduce(lhs._internal[0][0])),N>
+{
+  typedef decltype(Reduce(lhs._internal[0][0])) ret_t;
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2]=Reduce(lhs._internal[c1][c2]);
+  }}
+  return ret;
+}
+template<class l> accelerator_inline
+auto Reduce (const iScalar<l>& lhs) -> iScalar<decltype(Reduce(lhs._internal))>
+{
+  typedef decltype(Reduce(lhs._internal)) ret_t;
+  iScalar<ret_t> ret;
+  ret._internal = Reduce(lhs._internal);
+  return ret;
+}
+
+
+
+//////////////////////////////////////
+// innerProductD : if single promote to double and evaluate with sum 2x
+//////////////////////////////////////
 accelerator_inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){  return innerProduct(l,r); }
 accelerator_inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){  return innerProduct(l,r); }
 accelerator_inline RealD    innerProductD(const RealD    &l,const RealD    &r){  return innerProduct(l,r); }
@@ -52,14 +145,16 @@ accelerator_inline RealD    innerProductD(const RealF    &l,const RealF    &r){
 
 accelerator_inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r){  return innerProduct(l,r); }
 accelerator_inline vRealD    innerProductD(const vRealD    &l,const vRealD    &r){  return innerProduct(l,r); }
-accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r){  
+accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r)
+{  
   vComplexD la,lb;
   vComplexD ra,rb;
   Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
   Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
   return innerProduct(la,ra) + innerProduct(lb,rb); 
 }
-accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r){  
+accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r)
+{  
   vRealD la,lb;
   vRealD ra,rb;
   Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
@@ -67,6 +162,7 @@ accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r){
   return innerProduct(la,ra) + innerProduct(lb,rb); 
 }
 
+// Now do it for vector, matrix, scalar
 template<class l,class r,int N> accelerator_inline
 auto innerProductD (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProductD(lhs._internal[0],rhs._internal[0]))>
 {
diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h
index afd89f78..9067d43d 100644
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -72,50 +72,60 @@ NAMESPACE_BEGIN(Grid);
 
   template<> struct GridTypeMapper<RealF> : public GridTypeMapper_Base {
     typedef RealF scalar_type;
+    typedef RealD scalar_typeD;
     typedef RealF vector_type;
     typedef RealD vector_typeD;
     typedef RealF tensor_reduced ;
     typedef RealF scalar_object;
+    typedef RealD scalar_objectD;
     typedef ComplexF Complexified;
     typedef RealF Realified;
     typedef RealD DoublePrecision;
   };
   template<> struct GridTypeMapper<RealD> : public GridTypeMapper_Base {
     typedef RealD scalar_type;
+    typedef RealD scalar_typeD;
     typedef RealD vector_type;
     typedef RealD vector_typeD;
     typedef RealD tensor_reduced;
     typedef RealD scalar_object;
+    typedef RealD scalar_objectD;
     typedef ComplexD Complexified;
     typedef RealD Realified;
     typedef RealD DoublePrecision;
   };
   template<> struct GridTypeMapper<ComplexF> : public GridTypeMapper_Base {
     typedef ComplexF scalar_type;
+    typedef ComplexD scalar_typeD;
     typedef ComplexF vector_type;
     typedef ComplexD vector_typeD;
     typedef ComplexF tensor_reduced;
     typedef ComplexF scalar_object;
+    typedef ComplexD scalar_objectD;
     typedef ComplexF Complexified;
     typedef RealF Realified;
     typedef ComplexD DoublePrecision;
   };
   template<> struct GridTypeMapper<ComplexD> : public GridTypeMapper_Base {
     typedef ComplexD scalar_type;
+    typedef ComplexD scalar_typeD;
     typedef ComplexD vector_type;
     typedef ComplexD vector_typeD;
     typedef ComplexD tensor_reduced;
     typedef ComplexD scalar_object;
+    typedef ComplexD scalar_objectD;
     typedef ComplexD Complexified;
     typedef RealD Realified;
     typedef ComplexD DoublePrecision;
   };
   template<> struct GridTypeMapper<Integer> : public GridTypeMapper_Base {
     typedef Integer scalar_type;
+    typedef Integer scalar_typeD;
     typedef Integer vector_type;
     typedef Integer vector_typeD;
     typedef Integer tensor_reduced;
     typedef Integer scalar_object;
+    typedef Integer scalar_objectD;
     typedef void Complexified;
     typedef void Realified;
     typedef void DoublePrecision;
@@ -123,20 +133,24 @@ NAMESPACE_BEGIN(Grid);
 
   template<> struct GridTypeMapper<vRealF> : public GridTypeMapper_Base {
     typedef RealF  scalar_type;
+    typedef RealD  scalar_typeD;
     typedef vRealF vector_type;
     typedef vRealD vector_typeD;
     typedef vRealF tensor_reduced;
     typedef RealF  scalar_object;
+    typedef RealD  scalar_objectD;
     typedef vComplexF Complexified;
     typedef vRealF Realified;
     typedef vRealD DoublePrecision;
   };
   template<> struct GridTypeMapper<vRealD> : public GridTypeMapper_Base {
     typedef RealD  scalar_type;
+    typedef RealD  scalar_typeD;
     typedef vRealD vector_type;
     typedef vRealD vector_typeD;
     typedef vRealD tensor_reduced;
     typedef RealD  scalar_object;
+    typedef RealD  scalar_objectD;
     typedef vComplexD Complexified;
     typedef vRealD Realified;
     typedef vRealD DoublePrecision;
@@ -144,10 +158,12 @@ NAMESPACE_BEGIN(Grid);
   template<> struct GridTypeMapper<vRealH> : public GridTypeMapper_Base {
     // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
     typedef RealF  scalar_type;
+    typedef RealD  scalar_typeD;
     typedef vRealH vector_type;
     typedef vRealD vector_typeD;
     typedef vRealH tensor_reduced;
     typedef RealF  scalar_object;
+    typedef RealD  scalar_objectD;
     typedef vComplexH Complexified;
     typedef vRealH Realified;
     typedef vRealD DoublePrecision;
@@ -155,40 +171,48 @@ NAMESPACE_BEGIN(Grid);
   template<> struct GridTypeMapper<vComplexH> : public GridTypeMapper_Base {
     // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
     typedef ComplexF  scalar_type;
+    typedef ComplexD  scalar_typeD;
     typedef vComplexH vector_type;
     typedef vComplexD vector_typeD;
     typedef vComplexH tensor_reduced;
     typedef ComplexF  scalar_object;
+    typedef ComplexD  scalar_objectD;
     typedef vComplexH Complexified;
     typedef vRealH Realified;
     typedef vComplexD DoublePrecision;
   };
   template<> struct GridTypeMapper<vComplexF> : public GridTypeMapper_Base {
     typedef ComplexF  scalar_type;
+    typedef ComplexD  scalar_typeD;
     typedef vComplexF vector_type;
     typedef vComplexD vector_typeD;
     typedef vComplexF tensor_reduced;
     typedef ComplexF  scalar_object;
+    typedef ComplexD  scalar_objectD;
     typedef vComplexF Complexified;
     typedef vRealF Realified;
     typedef vComplexD DoublePrecision;
   };
   template<> struct GridTypeMapper<vComplexD> : public GridTypeMapper_Base {
     typedef ComplexD  scalar_type;
+    typedef ComplexD  scalar_typeD;
     typedef vComplexD vector_type;
     typedef vComplexD vector_typeD;
     typedef vComplexD tensor_reduced;
     typedef ComplexD  scalar_object;
+    typedef ComplexD  scalar_objectD;
     typedef vComplexD Complexified;
     typedef vRealD Realified;
     typedef vComplexD DoublePrecision;
   };
   template<> struct GridTypeMapper<vInteger> : public GridTypeMapper_Base {
     typedef  Integer scalar_type;
+    typedef  Integer scalar_typeD;
     typedef vInteger vector_type;
     typedef vInteger vector_typeD;
     typedef vInteger tensor_reduced;
     typedef  Integer scalar_object;
+    typedef  Integer scalar_objectD;
     typedef void Complexified;
     typedef void Realified;
     typedef void DoublePrecision;
@@ -198,6 +222,7 @@ NAMESPACE_BEGIN(Grid);
   using BaseTraits   = GridTypeMapper<T>; \
   using scalar_type  = typename BaseTraits::scalar_type; \
   using vector_type  = typename BaseTraits::vector_type; \
+  using scalar_typeD = typename BaseTraits::scalar_typeD; \
   using vector_typeD = typename BaseTraits::vector_typeD; \
   static constexpr int TensorLevel = BaseTraits::TensorLevel + 1
 
@@ -205,6 +230,7 @@ NAMESPACE_BEGIN(Grid);
     GridTypeMapper_RepeatedTypes;
     using tensor_reduced  = iScalar<typename BaseTraits::tensor_reduced>;
     using scalar_object   = iScalar<typename BaseTraits::scalar_object>;
+    using scalar_objectD  = iScalar<typename BaseTraits::scalar_objectD>;
     using Complexified    = iScalar<typename BaseTraits::Complexified>;
     using Realified       = iScalar<typename BaseTraits::Realified>;
     using DoublePrecision = iScalar<typename BaseTraits::DoublePrecision>;
@@ -218,6 +244,7 @@ NAMESPACE_BEGIN(Grid);
     GridTypeMapper_RepeatedTypes;
     using tensor_reduced  = iScalar<typename BaseTraits::tensor_reduced>;
     using scalar_object   = iVector<typename BaseTraits::scalar_object,   N>;
+    using scalar_objectD  = iVector<typename BaseTraits::scalar_objectD,  N>;
     using Complexified    = iVector<typename BaseTraits::Complexified,    N>;
     using Realified       = iVector<typename BaseTraits::Realified,       N>;
     using DoublePrecision = iVector<typename BaseTraits::DoublePrecision, N>;
@@ -231,6 +258,7 @@ NAMESPACE_BEGIN(Grid);
     GridTypeMapper_RepeatedTypes;
     using tensor_reduced  = iScalar<typename BaseTraits::tensor_reduced>;
     using scalar_object   = iMatrix<typename BaseTraits::scalar_object,   N>;
+    using scalar_objectD  = iMatrix<typename BaseTraits::scalar_objectD,  N>;
     using Complexified    = iMatrix<typename BaseTraits::Complexified,    N>;
     using Realified       = iMatrix<typename BaseTraits::Realified,       N>;
     using DoublePrecision = iMatrix<typename BaseTraits::DoublePrecision, N>;