From 441a52ee5d9917ef0c7d2de179adcc9dab71804d Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 15 Apr 2017 10:57:21 +0100 Subject: [PATCH 1/5] First cut at higher precision reduction --- lib/lattice/Lattice_reduction.h | 197 ++++++++++++++++---------------- lib/simd/Grid_vector_types.h | 1 - lib/tensors/Tensor_class.h | 3 + lib/tensors/Tensor_inner.h | 142 ++++++++++++++++------- lib/tensors/Tensor_traits.h | 10 ++ 5 files changed, 210 insertions(+), 143 deletions(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 45a88a64..bb7808b8 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -38,110 +38,108 @@ namespace Grid { //////////////////////////////////////////////////////////////////////////////////////////////////// // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); - return std::real(nrm); +template inline RealD norm2(const Lattice &arg){ + ComplexD nrm = innerProduct(arg,arg); + return std::real(nrm); +} + +template +inline ComplexD innerProduct(const Lattice &left,const Lattice &right) +{ + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + scalar_type nrm; + + GridBase *grid = left._grid; + + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; } - - template - inline ComplexD innerProduct(const Lattice &left,const Lattice &right) - { - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - scalar_type nrm; - - GridBase *grid = left._grid; - - std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); - - decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation - for(int ss=myoff;ssSumArraySize();i++){ - vvnrm = vvnrm+sumarray[i]; - } - nrm = Reduce(vvnrm);// sum across simd - right._grid->GlobalSum(nrm); - return nrm; + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); + + decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + for(int ss=myoff;ssSumArraySize();i++){ + vvnrm = vvnrm+sumarray[i]; + } + nrm = Reduce(vvnrm);// sum across simd + right._grid->GlobalSum(nrm); + return nrm; +} + +template +inline auto sum(const LatticeUnaryExpression & expr) + ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object +{ + return sum(closure(expr)); +} - template - inline auto sum(const LatticeUnaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object - { - return sum(closure(expr)); - } - - template - inline auto sum(const LatticeBinaryExpression & expr) +template +inline auto sum(const LatticeBinaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object - { - return sum(closure(expr)); +{ + return sum(closure(expr)); +} + + +template +inline auto sum(const LatticeTrinaryExpression & expr) + ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), + eval(0,std::get<1>(expr.second)), + eval(0,std::get<2>(expr.second)) + ))::scalar_object +{ + return sum(closure(expr)); +} + +template +inline typename vobj::scalar_object sum(const Lattice &arg) +{ + GridBase *grid=arg._grid; + int Nsimd = grid->Nsimd(); + + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; + } + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(grid->oSites(),thr,mywork,myoff); + + vobj vvsum=zero; + for(int ss=myoff;ss - inline auto sum(const LatticeTrinaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), - eval(0,std::get<1>(expr.second)), - eval(0,std::get<2>(expr.second)) - ))::scalar_object - { - return sum(closure(expr)); - } - - template - inline typename vobj::scalar_object sum(const Lattice &arg){ - - GridBase *grid=arg._grid; - int Nsimd = grid->Nsimd(); - - std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(grid->oSites(),thr,mywork,myoff); - - vobj vvsum=zero; - for(int ss=myoff;ssSumArraySize();i++){ - vsum = vsum+sumarray[i]; - } - - typedef typename vobj::scalar_object sobj; - sobj ssum=zero; - - std::vector buf(Nsimd); - extract(vsum,buf); - - for(int i=0;iGlobalSum(ssum); - - return ssum; - } - - + sumarray[thr]=vvsum; + } + + vobj vsum=zero; // sum across threads + for(int i=0;iSumArraySize();i++){ + vsum = vsum+sumarray[i]; + } + + typedef typename vobj::scalar_object sobj; + sobj ssum=zero; + + std::vector buf(Nsimd); + extract(vsum,buf); + + for(int i=0;iGlobalSum(ssum); + + return ssum; +} template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { @@ -214,7 +212,6 @@ template inline void sliceSum(const Lattice &Data,std::vector< result[t]=gsum; } - } diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index ccedf99c..248a625c 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -694,7 +694,6 @@ inline Grid_simd innerProduct(const Grid_simd &l, const Grid_simd &r) { return conjugate(l) * r; } - template inline Grid_simd outerProduct(const Grid_simd &l, const Grid_simd &r) { diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h index e0b69eb0..9b806d75 100644 --- a/lib/tensors/Tensor_class.h +++ b/lib/tensors/Tensor_class.h @@ -56,6 +56,7 @@ class iScalar { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; @@ -193,6 +194,7 @@ class iVector { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; typedef iScalar tensor_reduced; @@ -305,6 +307,7 @@ class iMatrix { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; diff --git a/lib/tensors/Tensor_inner.h b/lib/tensors/Tensor_inner.h index 22284be4..46185652 100644 --- a/lib/tensors/Tensor_inner.h +++ b/lib/tensors/Tensor_inner.h @@ -29,51 +29,109 @@ Author: Peter Boyle #ifndef GRID_MATH_INNER_H #define GRID_MATH_INNER_H namespace Grid { - /////////////////////////////////////////////////////////////////////////////////////// - // innerProduct Scalar x Scalar -> Scalar - // innerProduct Vector x Vector -> Scalar - // innerProduct Matrix x Matrix -> Scalar - /////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const sobj &arg){ - typedef typename sobj::scalar_type scalar; - decltype(innerProduct(arg,arg)) nrm; - nrm = innerProduct(arg,arg); - RealD ret = real(nrm); - return ret; - } + /////////////////////////////////////////////////////////////////////////////////////// + // innerProduct Scalar x Scalar -> Scalar + // innerProduct Vector x Vector -> Scalar + // innerProduct Matrix x Matrix -> Scalar + /////////////////////////////////////////////////////////////////////////////////////// + template inline RealD norm2(const sobj &arg){ + auto nrm = innerProductD(arg,arg); + RealD ret = real(nrm); + return ret; + } + ////////////////////////////////////// + // If single promote to double and sum 2x + ////////////////////////////////////// - template inline - auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - for(int c1=0;c1 inline + auto innerProductD (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline - auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; - iScalar ret; - iScalar tmp; - ret=zero; - for(int c1=0;c1 inline - auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - ret._internal = innerProduct(lhs._internal,rhs._internal); - return ret; + return ret; + } + template inline + auto innerProductD (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProductD(lhs._internal,rhs._internal); + return ret; + } + ////////////////////// + // Keep same precison + ////////////////////// + template inline + auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProduct(lhs._internal,rhs._internal); + return ret; + } } #endif diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index 777b398d..4dcfd9b1 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -53,6 +53,7 @@ namespace Grid { public: typedef typename T::scalar_type scalar_type; typedef typename T::vector_type vector_type; + typedef typename T::vector_typeD vector_typeD; typedef typename T::tensor_reduced tensor_reduced; typedef typename T::scalar_object scalar_object; typedef typename T::Complexified Complexified; @@ -67,6 +68,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef RealF vector_type; + typedef RealD vector_typeD; typedef RealF tensor_reduced ; typedef RealF scalar_object; typedef ComplexF Complexified; @@ -77,6 +79,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef RealD vector_type; + typedef RealD vector_typeD; typedef RealD tensor_reduced; typedef RealD scalar_object; typedef ComplexD Complexified; @@ -87,6 +90,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef ComplexF vector_type; + typedef ComplexD vector_typeD; typedef ComplexF tensor_reduced; typedef ComplexF scalar_object; typedef ComplexF Complexified; @@ -97,6 +101,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef ComplexD vector_type; + typedef ComplexD vector_typeD; typedef ComplexD tensor_reduced; typedef ComplexD scalar_object; typedef ComplexD Complexified; @@ -118,6 +123,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef vRealF vector_type; + typedef vRealD vector_typeD; typedef vRealF tensor_reduced; typedef RealF scalar_object; typedef vComplexF Complexified; @@ -128,6 +134,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef vRealD vector_type; + typedef vRealD vector_typeD; typedef vRealD tensor_reduced; typedef RealD scalar_object; typedef vComplexD Complexified; @@ -138,6 +145,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef vComplexF vector_type; + typedef vComplexD vector_typeD; typedef vComplexF tensor_reduced; typedef ComplexF scalar_object; typedef vComplexF Complexified; @@ -148,6 +156,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef vComplexD vector_type; + typedef vComplexD vector_typeD; typedef vComplexD tensor_reduced; typedef ComplexD scalar_object; typedef vComplexD Complexified; @@ -158,6 +167,7 @@ namespace Grid { public: typedef Integer scalar_type; typedef vInteger vector_type; + typedef vInteger vector_typeD; typedef vInteger tensor_reduced; typedef Integer scalar_object; typedef void Complexified; From bf516c3b8127667129daa993a61d1f2755039407 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 15 Apr 2017 12:27:28 +0100 Subject: [PATCH 2/5] higher precision reduction variables in norm and inner product --- lib/lattice/Lattice_reduction.h | 3 ++- tests/solver/Test_dwf_cg_prec.cc | 2 +- tests/solver/Test_dwf_cg_unprec.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index bb7808b8..e12bf0dd 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -42,7 +42,7 @@ template inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return std::real(nrm); } - +// Double inner product template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { @@ -102,6 +102,7 @@ inline auto sum(const LatticeTrinaryExpression & expr) return sum(closure(expr)); } +// FIXME precision promoted summation template inline typename vobj::scalar_object sum(const Lattice &arg) { diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index 30436e36..0ede352e 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -89,7 +89,7 @@ int main(int argc, char** argv) { GridStopWatch CGTimer; SchurDiagMooeeOperator HermOpEO(Ddwf); - ConjugateGradient CG(1.0e-8, 10000, 0);// switch off the assert + ConjugateGradient CG(1.0e-5, 10000, 0);// switch off the assert CGTimer.Start(); CG(HermOpEO, src_o, result_o); diff --git a/tests/solver/Test_dwf_cg_unprec.cc b/tests/solver/Test_dwf_cg_unprec.cc index 131f6ce1..38f632ef 100644 --- a/tests/solver/Test_dwf_cg_unprec.cc +++ b/tests/solver/Test_dwf_cg_unprec.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); MdagMLinearOperator HermOp(Ddwf); - ConjugateGradient CG(1.0e-8,10000); + ConjugateGradient CG(1.0e-6,10000); CG(HermOp,src,result); Grid_finalize(); From 7ede6961269516638aa21560f9511ec5140fbcb2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 16 Apr 2017 23:40:00 +0100 Subject: [PATCH 3/5] Non compile of tests fixed --- TODO | 17 +- .../iterative/BlockConjugateGradient.h | 201 ------------------ lib/lattice/Lattice_reduction.h | 159 ++++++++++++++ lib/simd/Grid_vector_types.h | 2 +- lib/tensors/Tensor_traits.h | 1 + 5 files changed, 170 insertions(+), 210 deletions(-) diff --git a/TODO b/TODO index 91034f20..27579ad3 100644 --- a/TODO +++ b/TODO @@ -3,19 +3,20 @@ TODO: Peter's work list: --- Merge high precision reduction into develop +-- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started +-- Merge high precision reduction into develop <-- done +-- Precision conversion and sort out localConvert <-- -- Physical propagator interface --- Precision conversion and sort out localConvert --- slice* linalg routines for multiRHS, BlockCG + +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination + -- slice* linalg routines for multiRHS, BlockCG <-- started + -- Profile CG, BlockCG, etc... Flop count/rate -- Binary I/O speed up & x-strips --- Half-precision comms --- multiRHS DWF; benchmark on Cori/BNL for comms elimination +-- Half-precision comms <-- started -- GaugeFix into central location --- Help Julia with NPR code --- Switch to measurements +-- FFTfix in sensible place -- Multigrid Wilson and DWF, compare to other Multigrid implementations --- Remove DenseVector, DenseMatrix; Use Eigen instead. -- quaternions -- Might not need diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 0f4a3a80..1db89512 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -30,210 +30,9 @@ directory #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H #define GRID_BLOCK_CONJUGATE_GRADIENT_H -#include namespace Grid { -GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) -{ - int NN = BlockSolverGrid->_ndimension; - int nsimd = BlockSolverGrid->Nsimd(); - - std::vector latt_phys(0); - std::vector simd_phys(0); - std::vector mpi_phys(0); - - for(int d=0;d_fdimensions[d]); - simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); - mpi_phys.push_back(BlockSolverGrid->_processors[d]); - } - } - return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); -} - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // FIXME: Implementation is slow - // If we based this on Cshift it would work for spread out - // but it would be even slower - // - // Repeated extract slice is inefficient - // - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - for(int i=0;i -static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) -{ - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i -static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - // FIXME: Implementation is slow - // Not sure of best solution.. think about it - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - int Nblock = FullGrid->GlobalDimensions()[Orthog]; - - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - for(int i=0;i -static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss -static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = rhs._grid->GlobalDimensions()[Orthog]; - std::vector ip(Nblock); - sn.resize(Nblock); - - sliceInnerProductVector(ip,rhs,rhs,Orthog); - for(int ss=0;ss -static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - std::cout << " sliceInnerProductMatrix Dim "< IP(lhs._grid); - std::vector sip(Nblock); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - Lattice tmp = rhs; - - for(int s1=0;s1 #ifndef GRID_LATTICE_REDUCTION_H #define GRID_LATTICE_REDUCTION_H +#include + namespace Grid { #ifdef GRID_WARN_SUBOPTIMAL #warning "Optimisation alert all these reduction loops are NOT threaded " @@ -215,6 +217,163 @@ template inline void sliceSum(const Lattice &Data,std::vector< } } +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) + { + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); + } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template + static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) + { + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // FIXME: Implementation is slow + // If we based this on Cshift it would work for spread out + // but it would be even slower + // + // Repeated extract slice is inefficient + // + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + for(int i=0;i + static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) + { + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i + static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i + static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss + static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = rhs._grid->GlobalDimensions()[Orthog]; + std::vector ip(Nblock); + sn.resize(Nblock); + + sliceInnerProductVector(ip,rhs,rhs,Orthog); + for(int ss=0;ss Date: Mon, 17 Apr 2017 10:50:19 +0100 Subject: [PATCH 4/5] MultiRHS working, starting to optimise. Block doesn't and I thought it already was; puzzled. --- .../iterative/BlockConjugateGradient.h | 2 + lib/lattice/Lattice_reduction.h | 209 ++++++++++++++---- 2 files changed, 170 insertions(+), 41 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 1db89512..42a9af56 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -256,8 +256,10 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) Linop.HermOp(P, AP); // Alpha + // sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog); sliceInnerProductVector(v_pAp,P,AP,Orthog); for(int b=0;b &left,const Lattice &righ GridBase *grid = left._grid; std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; @@ -152,7 +149,6 @@ template inline void sliceSum(const Lattice &Data,std::vector< // FIXME // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -164,8 +160,8 @@ template inline void sliceSum(const Lattice &Data,std::vector< int rd=grid->_rdimensions[orthogdim]; std::vector > lvSum(rd); // will locally sum vectors first - std::vector lsSum(ld,zero); // sum across these down to scalars - std::vector extracted(Nsimd); // splitting the SIMD + std::vector lsSum(ld,zero); // sum across these down to scalars + std::vector extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file for(int r=0;r inline void sliceSum(const Lattice &Data,std::vector< std::vector coor(Nd); // sum over reduced dimension planes, breaking out orthog dir - for(int ss=0;ssoSites();ss++){ Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); int r = coor[orthogdim]; lvSum[r]=lvSum[r]+Data._odata[ss]; - } + } // Sum across simd lanes in the plane, breaking out orthog dir. std::vector icoor(Nd); @@ -217,6 +212,171 @@ template inline void sliceSum(const Lattice &Data,std::vector< } } +template + static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss +static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + GridBase *grid = lhs._grid; + assert(grid!=NULL); + conformable(grid,rhs._grid); + + // FIXME + // std::cout<SumArraySize()<<" threads "<_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + std::vector > lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars + std::vector > extracted(Nsimd); // splitting the SIMD + + result.resize(fd); // And then global sum to return the same vector to every node for IO to file + for(int r=0;r coor(Nd); + vector_type vv; + PARALLEL_FOR_LOOP_INTERN + for(int ss=0;ssoSites();ss++){ + Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); + int r = coor[orthogdim]; + vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); + PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add + lvSum[r]=lvSum[r]+vv; + } + } + } + + // Sum across simd lanes in the plane, breaking out orthog dir. + std::vector icoor(Nd); + for(int rt=0;rt temp; temp._internal = lvSum[rt]; + extract(temp,extracted); + + for(int idx=0;idxiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal; + + } + } + + // sum over nodes. + scalar_type gsum; + for(int t=0;t_processor_coor[orthogdim] ) { + gsum=lsSum[lt]; + } else { + gsum=scalar_type(0.0); + } + + grid->GlobalSum(gsum); + + result[t]=gsum; + } +} +#if 0 +template +static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Look at sliceSum implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + GridBase * grid = lhs._grid; + + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + int Nblock = grid->GlobalDimensions()[Orthog]; + int Nrblock = grid->_rdimensions[Orthog]; + int Nthr = grid->SumArraySize(); + + std::vector > sumarray(Nrblock*Nthr); + + parallel_for(int thr=0;thrSumArraySize();thr++){ + + int nwork, mywork, myoff; + + for(int rb=0;rboSites()/Nrblock),thr,mywork,myoff); + int off = rb * grid->_slice_ + vector_type vnrm=zero; // private to thread; sub summation + for(int ss=myoff;ss sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss_ndimension; @@ -322,41 +482,8 @@ template mat(i,j) = innerProduct(Lslice,Rslice); } } -#undef FORCE_DIAG -#ifdef FORCE_DIAG - for(int i=0;i - static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { From 8e161152e457954ba2493b9d807516ae0f2eb030 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 18 Apr 2017 10:51:55 +0100 Subject: [PATCH 5/5] MultiRHS solver improvements with slice operations moved into lattice and sped up. Block solver requires a lot of performance work. --- .../iterative/BlockConjugateGradient.h | 85 +++- lib/algorithms/iterative/ConjugateGradient.h | 39 +- lib/lattice/Lattice_reduction.h | 457 +++++++++--------- lib/log/Log.h | 4 +- lib/simd/Grid_vector_types.h | 4 - lib/tensors/Tensor_class.h | 28 +- lib/tensors/Tensor_traits.h | 3 +- .../solver/Test_staggered_block_cg_unprec.cc | 30 +- tests/solver/Test_staggered_cg_unprec.cc | 1 - 9 files changed, 366 insertions(+), 285 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 42a9af56..d90194ae 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -60,8 +60,8 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { int Orthog = 0; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; - std::cout< &Linop, const Field &Src, Field &Psi) Field AP(Src); Field R(Src); - GridStopWatch LinalgTimer; - GridStopWatch MatrixTimer; - GridStopWatch SolverTimer; - Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock); @@ -116,33 +112,49 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) P = R; sliceInnerProductMatrix(m_rr,R,R,Orthog); + GridStopWatch sliceInnerTimer; + GridStopWatch sliceMaddTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + SolverTimer.Start(); + int k; for (k = 1; k <= MaxIterations; k++) { RealD rrsum=0; for(int b=0;b &Linop, const Field &Src, Field &Psi) if ( max_resid < Tolerance*Tolerance ) { - std::cout << GridLogMessage<<" Block solver has converged in " - < &Linop, const Field &Src, Field &Psi) { int Orthog = 0; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; - std::cout< &Linop, const Field &Src, Field &Psi) P = R; sliceNorm(v_rr,R,Orthog); + GridStopWatch sliceInnerTimer; + GridStopWatch sliceMaddTimer; + GridStopWatch sliceNormTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + + SolverTimer.Start(); int k; for (k = 1; k <= MaxIterations; k++) { RealD rrsum=0; for(int b=0;b &Linop, const Field &Src, Field &Psi) } if ( max_resid < Tolerance*Tolerance ) { - std::cout << GridLogMessage<<" MultiRHS solver has converged in " - < { cp = a; ssq = norm2(src); - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: guess " << guess << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: src " << ssq << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mp " << d << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mmp " << b << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: cp,r " << cp << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: p " << a << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl; RealD rsq = Tolerance * Tolerance * ssq; @@ -144,19 +138,20 @@ class ConjugateGradient : public OperatorFunction { RealD resnorm = sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage - << "ConjugateGradient: Converged on iteration " << k << std::endl; - std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq) - << " true residual " << true_residual << " target " - << Tolerance << std::endl; - std::cout << GridLogMessage << "Time elapsed: Iterations " - << SolverTimer.Elapsed() << " Matrix " - << MatrixTimer.Elapsed() << " Linalg " - << LinalgTimer.Elapsed(); - std::cout << std::endl; + std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; + std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)< inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return std::real(nrm); } + // Double inner product template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) @@ -101,7 +102,6 @@ inline auto sum(const LatticeTrinaryExpression & expr) return sum(closure(expr)); } -// FIXME precision promoted summation template inline typename vobj::scalar_object sum(const Lattice &arg) { @@ -141,14 +141,22 @@ inline typename vobj::scalar_object sum(const Lattice &arg) return ssum; } + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... +////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { + /////////////////////////////////////////////////////// + // FIXME precision promoted summation + // may be important for correlation functions + // But easily avoided by using double precision fields + /////////////////////////////////////////////////////// typedef typename vobj::scalar_object sobj; GridBase *grid = Data._grid; assert(grid!=NULL); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -163,18 +171,27 @@ template inline void sliceSum(const Lattice &Data,std::vector< std::vector lsSum(ld,zero); // sum across these down to scalars std::vector extracted(Nsimd); // splitting the SIMD - result.resize(fd); // And then global sum to return the same vector to every node for IO to file + result.resize(fd); // And then global sum to return the same vector to every node for(int r=0;r coor(Nd); + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir - for(int ss=0;ssoSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - lvSum[r]=lvSum[r]+Data._odata[ss]; + // Parallel over orthog direction + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n inline void sliceSum(const Lattice &Data,std::vector< } } -template - static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) { @@ -247,8 +238,6 @@ static void sliceInnerProductVector( std::vector & result, const Latti assert(grid!=NULL); conformable(grid,rhs._grid); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -268,16 +257,18 @@ static void sliceInnerProductVector( std::vector & result, const Latti lvSum[r]=zero; } - // sum over reduced dimension planes, breaking out orthog dir - PARALLEL_REGION { - std::vector coor(Nd); - vector_type vv; - PARALLEL_FOR_LOOP_INTERN - for(int ss=0;ssoSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); - PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n & result, const Latti std::vector icoor(Nd); for(int rt=0;rt temp; temp._internal = lvSum[rt]; + iScalar temp; + temp._internal = lvSum[rt]; extract(temp,extracted); for(int idx=0;idx & result, const Latti result[t]=gsum; } } -#if 0 template -static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - // FIXME: Implementation is slow - // Look at sliceSum implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - GridBase * grid = lhs._grid; - - - const int Nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - assert(orthogdim >= 0); - assert(orthogdim < Nd); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - int Nblock = grid->GlobalDimensions()[Orthog]; - int Nrblock = grid->_rdimensions[Orthog]; - int Nthr = grid->SumArraySize(); - - std::vector > sumarray(Nrblock*Nthr); - - parallel_for(int thr=0;thrSumArraySize();thr++){ - - int nwork, mywork, myoff; - - for(int rb=0;rboSites()/Nrblock),thr,mywork,myoff); - int off = rb * grid->_slice_ - vector_type vnrm=zero; // private to thread; sub summation - for(int ss=myoff;ss sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss_ndimension; - int nsimd = BlockSolverGrid->Nsimd(); - - std::vector latt_phys(0); - std::vector simd_phys(0); - std::vector mpi_phys(0); - - for(int d=0;d_fdimensions[d]); - simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); - mpi_phys.push_back(BlockSolverGrid->_processors[d]); - } - } - return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); - } - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template - static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) - { - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // FIXME: Implementation is slow - // If we based this on Cshift it would work for spread out - // but it would be even slower - // - // Repeated extract slice is inefficient - // - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - for(int i=0;i - static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) - { - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i - static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Not sure of best solution.. think about it - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - int Nblock = FullGrid->GlobalDimensions()[Orthog]; - - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - for(int i=0;i - static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -499,9 +324,207 @@ template for(int ss=0;ss +static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int orthogdim,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced tensor_reduced; + + GridBase *grid = X._grid; + + int Nsimd =grid->Nsimd(); + int Nblock =grid->GlobalDimensions()[orthogdim]; + + int fd =grid->_fdimensions[orthogdim]; + int ld =grid->_ldimensions[orthogdim]; + int rd =grid->_rdimensions[orthogdim]; + + int e1 =grid->_slice_nblock[orthogdim]; + int e2 =grid->_slice_block [orthogdim]; + int stride =grid->_slice_stride[orthogdim]; + + std::vector icoor; + + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + vector_type av; + + for(int l=0;liCoorFromIindex(icoor,l); + int ldx =r+icoor[orthogdim]*rd; + scalar_type *as =(scalar_type *)&av; + as[l] = scalar_type(a[ldx])*scale; + } + + tensor_reduced at; at=av; + + parallel_for_nest2(int n=0;n +static void sliceMaddVectorSlow (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) +{ + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i +static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss_rdimensions[Orthog]. +////////////////////////////////////////////////////////////////////////////////////////// + +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) +{ + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } + + +template +static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + + for(int i=0;i +static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Real by "< = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Complex by "< =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Real by "< =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Complex by "<::vector_type vector_type; typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; - typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; + typedef iScalar tensor_reduced; typedef iScalar scalar_object; - // substitutes a real or complex version with same tensor structure typedef iScalar::Complexified> Complexified; typedef iScalar::Realified> Realified; @@ -78,8 +77,12 @@ class iScalar { iScalar & operator= (const iScalar ©me) = default; iScalar & operator= (iScalar &©me) = default; */ - iScalar(scalar_type s) - : _internal(s){}; // recurse down and hit the constructor for vector_type + + // template + // iScalar(EnableIf, vector_type> s) : _internal(s){}; // recurse down and hit the constructor for vector_type + + iScalar(scalar_type s) : _internal(s){}; // recurse down and hit the constructor for vector_type + iScalar(const Zero &z) { *this = zero; }; iScalar &operator=(const Zero &hero) { @@ -135,33 +138,28 @@ class iScalar { strong_inline const vtype &operator()(void) const { return _internal; } // Type casts meta programmed, must be pure scalar to match TensorRemove - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexF() const { return (TensorRemove(_internal)); }; - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexD() const { return (TensorRemove(_internal)); }; // template = 0,IfNotSimd = // 0> operator RealD () const { return(real(TensorRemove(_internal))); } - template = 0, - IfNotSimd = 0> + template = 0,IfNotSimd = 0> operator RealD() const { return TensorRemove(_internal); } - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator Integer() const { return Integer(TensorRemove(_internal)); } // convert from a something to a scalar via constructor of something arg - template ::value, T>::type - * = nullptr> - strong_inline iScalar operator=(T arg) { + template ::value, T>::type * = nullptr> + strong_inline iScalar operator=(T arg) { _internal = arg; return *this; } diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index e630c217..80009391 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -252,7 +252,8 @@ namespace Grid { template class isSIMDvectorized{ template - static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); + static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, + typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); template static double test(...); diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index e9142f8c..44e8fb52 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - const int Ls=8; + const int Ls=4; Grid_init(&argc,&argv); @@ -76,24 +76,44 @@ int main (int argc, char ** argv) RealD mass=0.01; ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); - MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); BlockConjugateGradient BCG(1.0e-8,10000); MultiRHSConjugateGradient mCG(1.0e-8,10000); - std::cout << GridLogMessage << " Calling CG "< HermOp4d(Ds4d); + FermionField src4d(UGrid); random(pRNG,src4d); + FermionField result4d(UGrid); result4d=zero; + CG(HermOp4d,src4d,result4d); + std::cout << GridLogMessage << "************************************************************************ "< HermOp(Ds); - ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src,result); Grid_finalize();