From 441a52ee5d9917ef0c7d2de179adcc9dab71804d Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 15 Apr 2017 10:57:21 +0100 Subject: [PATCH] First cut at higher precision reduction --- lib/lattice/Lattice_reduction.h | 197 ++++++++++++++++---------------- lib/simd/Grid_vector_types.h | 1 - lib/tensors/Tensor_class.h | 3 + lib/tensors/Tensor_inner.h | 142 ++++++++++++++++------- lib/tensors/Tensor_traits.h | 10 ++ 5 files changed, 210 insertions(+), 143 deletions(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 45a88a64..bb7808b8 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -38,110 +38,108 @@ namespace Grid { //////////////////////////////////////////////////////////////////////////////////////////////////// // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); - return std::real(nrm); +template inline RealD norm2(const Lattice &arg){ + ComplexD nrm = innerProduct(arg,arg); + return std::real(nrm); +} + +template +inline ComplexD innerProduct(const Lattice &left,const Lattice &right) +{ + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + scalar_type nrm; + + GridBase *grid = left._grid; + + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; } - - template - inline ComplexD innerProduct(const Lattice &left,const Lattice &right) - { - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - scalar_type nrm; - - GridBase *grid = left._grid; - - std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); - - decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation - for(int ss=myoff;ssSumArraySize();i++){ - vvnrm = vvnrm+sumarray[i]; - } - nrm = Reduce(vvnrm);// sum across simd - right._grid->GlobalSum(nrm); - return nrm; + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); + + decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + for(int ss=myoff;ssSumArraySize();i++){ + vvnrm = vvnrm+sumarray[i]; + } + nrm = Reduce(vvnrm);// sum across simd + right._grid->GlobalSum(nrm); + return nrm; +} + +template +inline auto sum(const LatticeUnaryExpression & expr) + ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object +{ + return sum(closure(expr)); +} - template - inline auto sum(const LatticeUnaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object - { - return sum(closure(expr)); - } - - template - inline auto sum(const LatticeBinaryExpression & expr) +template +inline auto sum(const LatticeBinaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object - { - return sum(closure(expr)); +{ + return sum(closure(expr)); +} + + +template +inline auto sum(const LatticeTrinaryExpression & expr) + ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), + eval(0,std::get<1>(expr.second)), + eval(0,std::get<2>(expr.second)) + ))::scalar_object +{ + return sum(closure(expr)); +} + +template +inline typename vobj::scalar_object sum(const Lattice &arg) +{ + GridBase *grid=arg._grid; + int Nsimd = grid->Nsimd(); + + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; + } + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(grid->oSites(),thr,mywork,myoff); + + vobj vvsum=zero; + for(int ss=myoff;ss - inline auto sum(const LatticeTrinaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), - eval(0,std::get<1>(expr.second)), - eval(0,std::get<2>(expr.second)) - ))::scalar_object - { - return sum(closure(expr)); - } - - template - inline typename vobj::scalar_object sum(const Lattice &arg){ - - GridBase *grid=arg._grid; - int Nsimd = grid->Nsimd(); - - std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(grid->oSites(),thr,mywork,myoff); - - vobj vvsum=zero; - for(int ss=myoff;ssSumArraySize();i++){ - vsum = vsum+sumarray[i]; - } - - typedef typename vobj::scalar_object sobj; - sobj ssum=zero; - - std::vector buf(Nsimd); - extract(vsum,buf); - - for(int i=0;iGlobalSum(ssum); - - return ssum; - } - - + sumarray[thr]=vvsum; + } + + vobj vsum=zero; // sum across threads + for(int i=0;iSumArraySize();i++){ + vsum = vsum+sumarray[i]; + } + + typedef typename vobj::scalar_object sobj; + sobj ssum=zero; + + std::vector buf(Nsimd); + extract(vsum,buf); + + for(int i=0;iGlobalSum(ssum); + + return ssum; +} template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { @@ -214,7 +212,6 @@ template inline void sliceSum(const Lattice &Data,std::vector< result[t]=gsum; } - } diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index ccedf99c..248a625c 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -694,7 +694,6 @@ inline Grid_simd innerProduct(const Grid_simd &l, const Grid_simd &r) { return conjugate(l) * r; } - template inline Grid_simd outerProduct(const Grid_simd &l, const Grid_simd &r) { diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h index e0b69eb0..9b806d75 100644 --- a/lib/tensors/Tensor_class.h +++ b/lib/tensors/Tensor_class.h @@ -56,6 +56,7 @@ class iScalar { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; @@ -193,6 +194,7 @@ class iVector { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; typedef iScalar tensor_reduced; @@ -305,6 +307,7 @@ class iMatrix { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; diff --git a/lib/tensors/Tensor_inner.h b/lib/tensors/Tensor_inner.h index 22284be4..46185652 100644 --- a/lib/tensors/Tensor_inner.h +++ b/lib/tensors/Tensor_inner.h @@ -29,51 +29,109 @@ Author: Peter Boyle #ifndef GRID_MATH_INNER_H #define GRID_MATH_INNER_H namespace Grid { - /////////////////////////////////////////////////////////////////////////////////////// - // innerProduct Scalar x Scalar -> Scalar - // innerProduct Vector x Vector -> Scalar - // innerProduct Matrix x Matrix -> Scalar - /////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const sobj &arg){ - typedef typename sobj::scalar_type scalar; - decltype(innerProduct(arg,arg)) nrm; - nrm = innerProduct(arg,arg); - RealD ret = real(nrm); - return ret; - } + /////////////////////////////////////////////////////////////////////////////////////// + // innerProduct Scalar x Scalar -> Scalar + // innerProduct Vector x Vector -> Scalar + // innerProduct Matrix x Matrix -> Scalar + /////////////////////////////////////////////////////////////////////////////////////// + template inline RealD norm2(const sobj &arg){ + auto nrm = innerProductD(arg,arg); + RealD ret = real(nrm); + return ret; + } + ////////////////////////////////////// + // If single promote to double and sum 2x + ////////////////////////////////////// - template inline - auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - for(int c1=0;c1 inline + auto innerProductD (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline - auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; - iScalar ret; - iScalar tmp; - ret=zero; - for(int c1=0;c1 inline - auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - ret._internal = innerProduct(lhs._internal,rhs._internal); - return ret; + return ret; + } + template inline + auto innerProductD (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProductD(lhs._internal,rhs._internal); + return ret; + } + ////////////////////// + // Keep same precison + ////////////////////// + template inline + auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProduct(lhs._internal,rhs._internal); + return ret; + } } #endif diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index 777b398d..4dcfd9b1 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -53,6 +53,7 @@ namespace Grid { public: typedef typename T::scalar_type scalar_type; typedef typename T::vector_type vector_type; + typedef typename T::vector_typeD vector_typeD; typedef typename T::tensor_reduced tensor_reduced; typedef typename T::scalar_object scalar_object; typedef typename T::Complexified Complexified; @@ -67,6 +68,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef RealF vector_type; + typedef RealD vector_typeD; typedef RealF tensor_reduced ; typedef RealF scalar_object; typedef ComplexF Complexified; @@ -77,6 +79,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef RealD vector_type; + typedef RealD vector_typeD; typedef RealD tensor_reduced; typedef RealD scalar_object; typedef ComplexD Complexified; @@ -87,6 +90,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef ComplexF vector_type; + typedef ComplexD vector_typeD; typedef ComplexF tensor_reduced; typedef ComplexF scalar_object; typedef ComplexF Complexified; @@ -97,6 +101,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef ComplexD vector_type; + typedef ComplexD vector_typeD; typedef ComplexD tensor_reduced; typedef ComplexD scalar_object; typedef ComplexD Complexified; @@ -118,6 +123,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef vRealF vector_type; + typedef vRealD vector_typeD; typedef vRealF tensor_reduced; typedef RealF scalar_object; typedef vComplexF Complexified; @@ -128,6 +134,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef vRealD vector_type; + typedef vRealD vector_typeD; typedef vRealD tensor_reduced; typedef RealD scalar_object; typedef vComplexD Complexified; @@ -138,6 +145,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef vComplexF vector_type; + typedef vComplexD vector_typeD; typedef vComplexF tensor_reduced; typedef ComplexF scalar_object; typedef vComplexF Complexified; @@ -148,6 +156,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef vComplexD vector_type; + typedef vComplexD vector_typeD; typedef vComplexD tensor_reduced; typedef ComplexD scalar_object; typedef vComplexD Complexified; @@ -158,6 +167,7 @@ namespace Grid { public: typedef Integer scalar_type; typedef vInteger vector_type; + typedef vInteger vector_typeD; typedef vInteger tensor_reduced; typedef Integer scalar_object; typedef void Complexified;