From 96ac56cacea12c5949d79e2d5aee43c250420d07 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 14 Aug 2019 13:08:01 +0100 Subject: [PATCH] Double precision variants for summation accuracy --- Grid/tensors/Tensor_class.h | 18 +++++- Grid/tensors/Tensor_inner.h | 106 +++++++++++++++++++++++++++++++++-- Grid/tensors/Tensor_traits.h | 28 +++++++++ 3 files changed, 144 insertions(+), 8 deletions(-) diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index 2b578387..75e42721 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -47,9 +47,11 @@ class GridTensorBase {}; using element = vtype; \ using scalar_type = typename Traits::scalar_type; \ using vector_type = typename Traits::vector_type; \ + using scalar_typeD = typename Traits::scalar_typeD; \ using vector_typeD = typename Traits::vector_typeD; \ using tensor_reduced = typename Traits::tensor_reduced; \ using scalar_object = typename Traits::scalar_object; \ + using scalar_objectD = typename Traits::scalar_objectD; \ using Complexified = typename Traits::Complexified; \ using Realified = typename Traits::Realified; \ using DoublePrecision = typename Traits::DoublePrecision; \ @@ -149,7 +151,7 @@ public: // Convert elements template - accelerator_inline iScalar operator=(iScalar &&arg) { + accelerator_inline iScalar operator=(const iScalar &arg) { _internal = arg._internal; return *this; } @@ -200,6 +202,14 @@ public: accelerator iVector() = default; accelerator_inline iVector(const Zero &z) { zeroit(*this); }; + template + accelerator_inline iVector &operator=(const iVector &him) + { + for (int i = 0; i < N; i++) { + _internal[i] = him._internal[i]; + } + return *this; + } accelerator_inline iVector &operator=(const Zero &hero) { zeroit(*this); return *this; @@ -296,10 +306,12 @@ public: accelerator_inline iMatrix(const Zero &z) { zeroit(*this); }; accelerator iMatrix() = default; - accelerator_inline iMatrix &operator=(const iMatrix &rhs) { + // Allow for type conversion. + template + accelerator_inline iMatrix &operator=(const iMatrix &rhs) { for (int i = 0; i < N; i++) for (int j = 0; j < N; j++) - vstream(_internal[i][j], rhs._internal[i][j]); + _internal[i][j] = rhs._internal[i][j]; return *this; }; diff --git a/Grid/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h index ca35e851..03f72966 100644 --- a/Grid/tensors/Tensor_inner.h +++ b/Grid/tensors/Tensor_inner.h @@ -41,10 +41,103 @@ template accelerator_inline RealD norm2(const sobj &arg){ RealD ret = real(nrm); return ret; } -////////////////////////////////////// -// If single promote to double and sum 2x -////////////////////////////////////// +//////////////////////////////////////////////////////////////// +// FIXME horizontal sum: single promote to double and sum +// Not yet used; make sum_cpu in LatticeReduction.h call these +// GPU sum_gpu is using a double precision promotion. +//////////////////////////////////////////////////////////////// +#if 0 +accelerator_inline ComplexD ReduceD(const ComplexF &l) { return l; }; +accelerator_inline ComplexD ReduceD(const ComplexD &l) { return l; }; +accelerator_inline RealD ReduceD(const RealD &l) { return l; }; +accelerator_inline RealD ReduceD(const RealF &l) { return l; }; + +accelerator_inline ComplexD ReduceD(const vComplexD &l) { return Reduce(l); }; +accelerator_inline RealD ReduceD(const vRealD &l) { return Reduce(l); }; +accelerator_inline ComplexD ReduceD(const vComplexF &l) +{ + vComplexD la,lb; + Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); + return Reduce(la)+Reduce(lb); +} +accelerator_inline RealD ReduceD(const vRealF &l) +{ + vRealD la,lb; + Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); + return Reduce(la)+Reduce(lb); +}; +/////////////////////////////////////////////////////// +// Now do it for vector, matrix, scalar +/////////////////////////////////////////////////////// +template accelerator_inline +auto ReduceD (const iVector& lhs) -> iVector +{ + typedef decltype(ReduceD(lhs._internal[0])) ret_t; + iVector ret; + for(int c1=0;c1 accelerator_inline +auto ReduceD (const iMatrix& lhs) -> iMatrix +{ + typedef decltype(ReduceD(lhs._internal[0][0])) ret_t; + iMatrix ret; + for(int c1=0;c1 accelerator_inline +auto ReduceD (const iScalar& lhs) -> iScalar +{ + typedef decltype(ReduceD(lhs._internal)) ret_t; + iScalar ret; + ret._internal = ReduceD(lhs._internal); + return ret; +} +#endif +/////////////////////////////////////////////////////// +// Now do Reduce for vector, matrix, scalar +/////////////////////////////////////////////////////// +template accelerator_inline +auto Reduce (const iVector& lhs) -> iVector +{ + typedef decltype(Reduce(lhs._internal[0])) ret_t; + iVector ret; + for(int c1=0;c1 accelerator_inline +auto Reduce (const iMatrix& lhs) -> iMatrix +{ + typedef decltype(Reduce(lhs._internal[0][0])) ret_t; + iMatrix ret; + for(int c1=0;c1 accelerator_inline +auto Reduce (const iScalar& lhs) -> iScalar +{ + typedef decltype(Reduce(lhs._internal)) ret_t; + iScalar ret; + ret._internal = Reduce(lhs._internal); + return ret; +} + + + +////////////////////////////////////// +// innerProductD : if single promote to double and evaluate with sum 2x +////////////////////////////////////// accelerator_inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); } accelerator_inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); } accelerator_inline RealD innerProductD(const RealD &l,const RealD &r){ return innerProduct(l,r); } @@ -52,14 +145,16 @@ accelerator_inline RealD innerProductD(const RealF &l,const RealF &r){ accelerator_inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r){ return innerProduct(l,r); } accelerator_inline vRealD innerProductD(const vRealD &l,const vRealD &r){ return innerProduct(l,r); } -accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r){ +accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r) +{ vComplexD la,lb; vComplexD ra,rb; Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v); return innerProduct(la,ra) + innerProduct(lb,rb); } -accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r){ +accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r) +{ vRealD la,lb; vRealD ra,rb; Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); @@ -67,6 +162,7 @@ accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r){ return innerProduct(la,ra) + innerProduct(lb,rb); } +// Now do it for vector, matrix, scalar template accelerator_inline auto innerProductD (const iVector& lhs,const iVector& rhs) -> iScalar { diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index afd89f78..9067d43d 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -72,50 +72,60 @@ NAMESPACE_BEGIN(Grid); template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealF scalar_type; + typedef RealD scalar_typeD; typedef RealF vector_type; typedef RealD vector_typeD; typedef RealF tensor_reduced ; typedef RealF scalar_object; + typedef RealD scalar_objectD; typedef ComplexF Complexified; typedef RealF Realified; typedef RealD DoublePrecision; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealD scalar_type; + typedef RealD scalar_typeD; typedef RealD vector_type; typedef RealD vector_typeD; typedef RealD tensor_reduced; typedef RealD scalar_object; + typedef RealD scalar_objectD; typedef ComplexD Complexified; typedef RealD Realified; typedef RealD DoublePrecision; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexF scalar_type; + typedef ComplexD scalar_typeD; typedef ComplexF vector_type; typedef ComplexD vector_typeD; typedef ComplexF tensor_reduced; typedef ComplexF scalar_object; + typedef ComplexD scalar_objectD; typedef ComplexF Complexified; typedef RealF Realified; typedef ComplexD DoublePrecision; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD scalar_type; + typedef ComplexD scalar_typeD; typedef ComplexD vector_type; typedef ComplexD vector_typeD; typedef ComplexD tensor_reduced; typedef ComplexD scalar_object; + typedef ComplexD scalar_objectD; typedef ComplexD Complexified; typedef RealD Realified; typedef ComplexD DoublePrecision; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; + typedef Integer scalar_typeD; typedef Integer vector_type; typedef Integer vector_typeD; typedef Integer tensor_reduced; typedef Integer scalar_object; + typedef Integer scalar_objectD; typedef void Complexified; typedef void Realified; typedef void DoublePrecision; @@ -123,20 +133,24 @@ NAMESPACE_BEGIN(Grid); template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealF scalar_type; + typedef RealD scalar_typeD; typedef vRealF vector_type; typedef vRealD vector_typeD; typedef vRealF tensor_reduced; typedef RealF scalar_object; + typedef RealD scalar_objectD; typedef vComplexF Complexified; typedef vRealF Realified; typedef vRealD DoublePrecision; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealD scalar_type; + typedef RealD scalar_typeD; typedef vRealD vector_type; typedef vRealD vector_typeD; typedef vRealD tensor_reduced; typedef RealD scalar_object; + typedef RealD scalar_objectD; typedef vComplexD Complexified; typedef vRealD Realified; typedef vRealD DoublePrecision; @@ -144,10 +158,12 @@ NAMESPACE_BEGIN(Grid); template<> struct GridTypeMapper : public GridTypeMapper_Base { // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types typedef RealF scalar_type; + typedef RealD scalar_typeD; typedef vRealH vector_type; typedef vRealD vector_typeD; typedef vRealH tensor_reduced; typedef RealF scalar_object; + typedef RealD scalar_objectD; typedef vComplexH Complexified; typedef vRealH Realified; typedef vRealD DoublePrecision; @@ -155,40 +171,48 @@ NAMESPACE_BEGIN(Grid); template<> struct GridTypeMapper : public GridTypeMapper_Base { // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types typedef ComplexF scalar_type; + typedef ComplexD scalar_typeD; typedef vComplexH vector_type; typedef vComplexD vector_typeD; typedef vComplexH tensor_reduced; typedef ComplexF scalar_object; + typedef ComplexD scalar_objectD; typedef vComplexH Complexified; typedef vRealH Realified; typedef vComplexD DoublePrecision; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexF scalar_type; + typedef ComplexD scalar_typeD; typedef vComplexF vector_type; typedef vComplexD vector_typeD; typedef vComplexF tensor_reduced; typedef ComplexF scalar_object; + typedef ComplexD scalar_objectD; typedef vComplexF Complexified; typedef vRealF Realified; typedef vComplexD DoublePrecision; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD scalar_type; + typedef ComplexD scalar_typeD; typedef vComplexD vector_type; typedef vComplexD vector_typeD; typedef vComplexD tensor_reduced; typedef ComplexD scalar_object; + typedef ComplexD scalar_objectD; typedef vComplexD Complexified; typedef vRealD Realified; typedef vComplexD DoublePrecision; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; + typedef Integer scalar_typeD; typedef vInteger vector_type; typedef vInteger vector_typeD; typedef vInteger tensor_reduced; typedef Integer scalar_object; + typedef Integer scalar_objectD; typedef void Complexified; typedef void Realified; typedef void DoublePrecision; @@ -198,6 +222,7 @@ NAMESPACE_BEGIN(Grid); using BaseTraits = GridTypeMapper; \ using scalar_type = typename BaseTraits::scalar_type; \ using vector_type = typename BaseTraits::vector_type; \ + using scalar_typeD = typename BaseTraits::scalar_typeD; \ using vector_typeD = typename BaseTraits::vector_typeD; \ static constexpr int TensorLevel = BaseTraits::TensorLevel + 1 @@ -205,6 +230,7 @@ NAMESPACE_BEGIN(Grid); GridTypeMapper_RepeatedTypes; using tensor_reduced = iScalar; using scalar_object = iScalar; + using scalar_objectD = iScalar; using Complexified = iScalar; using Realified = iScalar; using DoublePrecision = iScalar; @@ -218,6 +244,7 @@ NAMESPACE_BEGIN(Grid); GridTypeMapper_RepeatedTypes; using tensor_reduced = iScalar; using scalar_object = iVector; + using scalar_objectD = iVector; using Complexified = iVector; using Realified = iVector; using DoublePrecision = iVector; @@ -231,6 +258,7 @@ NAMESPACE_BEGIN(Grid); GridTypeMapper_RepeatedTypes; using tensor_reduced = iScalar; using scalar_object = iMatrix; + using scalar_objectD = iMatrix; using Complexified = iMatrix; using Realified = iMatrix; using DoublePrecision = iMatrix;