From e9cc21900f00b81a17ab87d649e014edc99c636b Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Tue, 20 Jun 2017 12:37:41 +0100 Subject: [PATCH] Block solver complete for staggered. Now stable on mass 0.003 and gives 8x (!) speed up on Haswell laptop vs. standard CG for 8 RHS solves. 166 iterations vs. 537 iterations so algorithmic gain + 2x in flop rate gain. Better than a slap in the face with a wet kipper. --- .../iterative/BlockConjugateGradient.h | 295 ++++++++++++++++-- lib/lattice/Lattice_reduction.h | 235 +++----------- .../solver/Test_staggered_block_cg_unprec.cc | 13 +- 3 files changed, 321 insertions(+), 222 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 53e11fa7..f8b83b1f 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -33,6 +33,8 @@ directory namespace Grid { +enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS }; + ////////////////////////////////////////////////////////////////////////// // Block conjugate gradient. Dimension zero should be the block direction ////////////////////////////////////////////////////////////////////////// @@ -40,24 +42,274 @@ template class BlockConjugateGradient : public OperatorFunction { public: + typedef typename Field::scalar_type scomplex; int blockDim ; - int Nblock; + + BlockCGtype CGtype; bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. // Defaults true. RealD Tolerance; Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion - BlockConjugateGradient(int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) + BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), + CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){}; +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Thin QR factorisation (google it) +//////////////////////////////////////////////////////////////////////////////////////////////////// +void ThinQRfact (Eigen::MatrixXcd &m_rr, + Eigen::MatrixXcd &C, + Eigen::MatrixXcd &Cinv, + Field & Q, + const Field & R) +{ + int Orthog = blockDim; // First dimension is block dim; this is an assumption + //////////////////////////////////////////////////////////////////////////////////////////////////// + //Dimensions + // R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock + // + // Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen) + // + // Q C = R => Q = R C^{-1} + // + // Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} + // + // Set C = L^{dag}, and then Q^dag Q = ident + // + // Checks: + // Cdag C = Rdag R ; passes. 
+ // QdagQ = 1 ; passes + //////////////////////////////////////////////////////////////////////////////////////////////////// + sliceInnerProductMatrix(m_rr,R,R,Orthog); + + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Cholesky from Eigen + // There exists a ldlt that is documented as more stable + //////////////////////////////////////////////////////////////////////////////////////////////////// + Eigen::MatrixXcd L = m_rr.llt().matrixL(); + + C = L.adjoint(); + Cinv = C.inverse(); + + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Q = R C^{-1} + // + // Q_j = R_i Cinv(i,j) + // + // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already + //////////////////////////////////////////////////////////////////////////////////////////////////// + // FIXME:: make a sliceMulMatrix to avoid zero vector + sliceMulMatrix(Q,Cinv,R,Orthog); +} +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Call one of several implementations +//////////////////////////////////////////////////////////////////////////////////////////////////// void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + if ( CGtype == BlockCGrQ ) { + BlockCGrQsolve(Linop,Src,Psi); + } else if (CGtype == BlockCG ) { + BlockCGsolve(Linop,Src,Psi); + } else if (CGtype == CGmultiRHS ) { + CGmultiRHSsolve(Linop,Src,Psi); + } else { + assert(0); + } +} + +//////////////////////////////////////////////////////////////////////////// +// BlockCGrQ implementation: +//-------------------------- +// X is guess/Solution +// B is RHS +// Solve A X_i = B_i ; i refers to Nblock index +//////////////////////////////////////////////////////////////////////////// +void BlockCGrQsolve(LinearOperatorBase &Linop, const Field &B, Field &X) +{ + int Orthog = blockDim; // First dimension is block dim; this is an assumption + Nblock = B._grid->_fdimensions[Orthog]; + + std::cout< residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,B,Orthog); + RealD sssum=0; + for(int b=0;b Thin QR factorisation (google it) + * for k: + * Z = AD + * M = [D^dag Z]^{-1} + * X = X + D MC + * QS = Q - ZM + * D = Q + D S^dag + * C = S C + */ + /////////////////////////////////////// + // Initial block: initial search dir is guess + /////////////////////////////////////// + std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " < Thin QR factorisation (google it) + + Linop.HermOp(X, AD); + tmp = B - AD; + ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); + D=Q; + + std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " < max_resid ) max_resid = rr; + } + + std::cout << GridLogIterative << "\titeration "< &Linop, const Field &Src, Field &Psi) { int Orthog = blockDim; // First dimension is block dim; this is an assumption Nblock = Src._grid->_fdimensions[Orthog]; @@ -163,8 +415,9 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) ********************* */ RealD max_resid=0; + RealD rr; for(int b=0;b max_resid ) max_resid = rr; } @@ -174,13 +427,14 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) std::cout << GridLogMessage<<"BlockCG converged in "< &Linop, const Field &Src, Field &Psi) if (ErrorOnNoConverge) assert(0); IterationsToComplete = k; } -}; - - ////////////////////////////////////////////////////////////////////////// // multiRHS conjugate gradient. 
Dimension zero should be the block direction +// Use this for spread out across nodes ////////////////////////////////////////////////////////////////////////// -template -class MultiRHSConjugateGradient : public OperatorFunction { - public: - - typedef typename Field::scalar_type scomplex; - - int blockDim; - int Nblock; - bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. - // Defaults true. - RealD Tolerance; - Integer MaxIterations; - Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion - - MultiRHSConjugateGradient(int Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) - : Tolerance(tol), - blockDim(Orthog), - MaxIterations(maxit), - ErrorOnNoConverge(err_on_no_conv){}; - -void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +void CGmultiRHSsolve(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { int Orthog = blockDim; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; @@ -331,7 +563,7 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) std::cout << GridLogMessage<<"MultiRHS solver converged in " < &Linop, const Field &Src, Field &Psi) if (ErrorOnNoConverge) assert(0); IterationsToComplete = k; } + }; - - } #endif diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 78f88ce3..c5b20f3c 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -369,71 +369,6 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice } }; - -/* -template -static void sliceMaddVectorSlow (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) -{ - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i -static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss_rdimensions[Orthog]. 
-////////////////////////////////////////////////////////////////////////////////////////// - inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) { int NN = BlockSolverGrid->_ndimension; @@ -453,7 +388,6 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } - template static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) { @@ -469,64 +403,10 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice Lattice Xslice(SliceGrid); Lattice Rslice(SliceGrid); -#if 0 - // R[i] = Y[i] + X[j] a(j,i) - for(int i=0;i_ndimension; - int nl = SliceGrid->_ndimension; - -#pragma omp parallel -{ - - std::vector lcoor(nl); // sliced coor - std::vector hcoor(nh); // unsliced coor - std::vector s_x(Nblock); - -#pragma omp for - for(int idx=0;idxlSites();idx++){ - - SliceGrid->LocalIndexToLocalCoor(idx,lcoor); - - int ddl=0; - for(int d=0;d_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; int nl = SliceGrid->_ndimension; - //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" int stride=FullGrid->_slice_stride[Orthog]; @@ -535,7 +415,6 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int ostride=FullGrid->_ostride[Orthog]; #pragma omp parallel { - std::vector s_x(Nblock); #pragma omp for collapse(2) @@ -543,13 +422,11 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice for(int b=0;b &R,Eigen::MatrixXcd &aa,const Lattice } }} } -#endif +}; + +template +static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + + assert( FullGrid->_simd_layout[Orthog]==1); + int nh = FullGrid->_ndimension; + int nl = SliceGrid->_ndimension; + + //FIXME package in a convenient iterator + //Should loop over a plane orthogonal to direction "Orthog" + int stride=FullGrid->_slice_stride[Orthog]; + int block =FullGrid->_slice_block [Orthog]; + int nblock=FullGrid->_slice_nblock[Orthog]; + int ostride=FullGrid->_ostride[Orthog]; +#pragma omp parallel + { + std::vector s_x(Nblock); + +#pragma omp for collapse(2) + for(int n=0;n static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { - // FIXME: Implementation is slow - // Not sure of best solution.. 
think about it typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -582,63 +507,6 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); -#if 0 - for(int i=0;i_ndimension; - int nl = SliceGrid->_ndimension; - -#pragma omp parallel -{ - std::vector lcoor(nl); // sliced coor - std::vector hcoor(nh); // unsliced coor - std::vector Left(Nblock); - std::vector Right(Nblock); - Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); - -#pragma omp for - for(int idx=0;idxlSites();idx++){ - - SliceGrid->LocalIndexToLocalCoor(idx,lcoor); - - int ddl=0; - for(int d=0;d ip = innerProduct(Left[i],Right[j]); - mat_thread(i,j) += ip; - }} - } - -#pragma omp critical - { - mat += mat_thread; - } - -} -#endif - -#if 1 assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; int nl = SliceGrid->_ndimension; @@ -681,7 +549,6 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice mat += mat_thread; } } -#endif return; } diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index 8da93195..8db41e98 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - const int Ls=4; + const int Ls=8; Grid_init(&argc,&argv); @@ -80,12 +80,13 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); int blockDim = 0; - BlockConjugateGradient BCG(blockDim,1.0e-8,10000); - MultiRHSConjugateGradient mCG(blockDim,1.0e-8,10000); + BlockConjugateGradient BCGrQ(BlockCGrQ,blockDim,1.0e-8,10000); + BlockConjugateGradient BCG (BlockCG,blockDim,1.0e-8,10000); + BlockConjugateGradient mCG (CGmultiRHS,blockDim,1.0e-8,10000); - std::cout << GridLogMessage << "************************************************************************ "< HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); @@ -112,7 +113,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Calling Block CG for "<
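
For reference, the ThinQRfact step above builds the thin QR factorisation R = Q C from a Cholesky decomposition of the small Nblock x Nblock Gram matrix R^dag R, rather than from a Householder QR of the full fermion field. Below is a minimal dense-matrix sketch of the same construction in plain Eigen; the tall random matrix stands in for the fermion field, and every size and name is illustrative only, not part of Grid's API.

// Sketch: Cholesky-based thin QR, as in ThinQRfact (illustrative only).
// Build with e.g.:  g++ -std=c++11 -I<path-to-eigen> thinqr_sketch.cc
#include <iostream>
#include <Eigen/Dense>

int main(void)
{
  const int Nferm  = 1000;   // stands in for the fermion-field volume
  const int Nblock = 8;      // number of right-hand sides (the block dimension)

  // R plays the role of the block residual, ferm x Nblock
  Eigen::MatrixXcd R = Eigen::MatrixXcd::Random(Nferm, Nblock);

  // m_rr = R^dag R is Hermitian positive definite: Cholesky m_rr = L L^dag
  Eigen::MatrixXcd m_rr = R.adjoint() * R;          // sliceInnerProductMatrix, densely
  Eigen::MatrixXcd L    = m_rr.llt().matrixL();

  // Set C = L^dag so that R = Q C with Q^dag Q = 1
  Eigen::MatrixXcd C    = L.adjoint();
  Eigen::MatrixXcd Cinv = C.inverse();
  Eigen::MatrixXcd Q    = R * Cinv;                 // sliceMulMatrix, densely

  // The two checks quoted in the ThinQRfact comments
  std::cout << "|Q^dag Q - 1| = "
            << (Q.adjoint()*Q - Eigen::MatrixXcd::Identity(Nblock,Nblock)).norm() << std::endl;
  std::cout << "|Q C - R|     = " << (Q*C - R).norm() << std::endl;
  return 0;
}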
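
The BlockCGrQ iteration itself is the recurrence written out in the comment block of BlockCGrQsolve: QC = B - AX and D = Q initially, then per step Z = AD, M = [D^dag Z]^{-1}, X = X + D M C, thin-QR factorise Q - Z M into Q S, D = Q + D S^dag, C = S C. The sketch below runs that recurrence on a dense Hermitian positive-definite matrix standing in for MdagM of the staggered operator; it illustrates the update pattern only and is not the Grid implementation.

// Sketch: the BlockCGrQ recurrence on a dense Hermitian positive-definite
// system (illustrative only; assumes Eigen is available).
#include <iostream>
#include <Eigen/Dense>

// Cholesky-based thin QR:  R = Q C  with  Q^dag Q = 1  (as in ThinQRfact)
static void thinQR(const Eigen::MatrixXcd &R, Eigen::MatrixXcd &Q, Eigen::MatrixXcd &C)
{
  Eigen::MatrixXcd G = R.adjoint() * R;
  Eigen::MatrixXcd L = G.llt().matrixL();
  C = L.adjoint();
  Q = R * C.inverse();
}

int main(void)
{
  const int N = 200, Nblock = 8;
  const double Tolerance = 1.0e-8;

  // Hermitian positive-definite A stands in for MdagM
  Eigen::MatrixXcd T = Eigen::MatrixXcd::Random(N, N);
  Eigen::MatrixXcd A = T.adjoint()*T + Eigen::MatrixXcd::Identity(N, N);

  Eigen::MatrixXcd B = Eigen::MatrixXcd::Random(N, Nblock);  // Nblock right-hand sides
  Eigen::MatrixXcd X = Eigen::MatrixXcd::Zero(N, Nblock);    // zero initial guess

  // QC = R0 = B - A X ;  D = Q
  Eigen::MatrixXcd Q, C, S;
  thinQR(B - A*X, Q, C);
  Eigen::MatrixXcd D = Q;

  const double ssq = B.squaredNorm();

  for (int k = 1; k <= 1000; k++) {
    Eigen::MatrixXcd Z  = A * D;                    // Z = A D
    Eigen::MatrixXcd DZ = D.adjoint() * Z;
    Eigen::MatrixXcd M  = DZ.inverse();             // M = [D^dag Z]^{-1}

    X += D * (M * C);                               // X = X + D M C
    thinQR(Q - Z*M, Q, S);                          // Q S = Q - Z M
    D  = Q + D * S.adjoint();                       // D = Q + D S^dag
    C  = S * C;                                     // C = S C

    // R_k = Q_k C_k, so the residual norm is carried entirely by C
    double rr = C.squaredNorm() / ssq;
    if (rr < Tolerance*Tolerance) {
      std::cout << "converged in " << k << " iterations; true |B - A X|/|B| = "
                << (B - A*X).norm() / B.norm() << std::endl;
      break;
    }
  }
  return 0;
}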
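
Finally, the sliceMaddMatrix / sliceMulMatrix helpers added to Lattice_reduction.h apply the small Nblock x Nblock matrix from the right across the slices of the block dimension, R_i = Y_i + scale * X_j a(j,i), which is the convention flagged in the "NB maddMatrix conventions are Right multiplication" comment above. A dense-matrix analogue of that index convention (illustrative only, nothing here is Grid API):

// Sketch: the right-multiplication convention of sliceMaddMatrix / sliceMulMatrix,
// with columns standing in for the slices of the block dimension (illustrative only).
#include <complex>
#include <iostream>
#include <Eigen/Dense>

int main(void)
{
  const int Nferm = 6, Nblock = 3;
  const std::complex<double> scale = 2.0;  // RealD in Grid; complex here to keep Eigen's scalar types uniform

  Eigen::MatrixXcd X  = Eigen::MatrixXcd::Random(Nferm, Nblock);
  Eigen::MatrixXcd Y  = Eigen::MatrixXcd::Random(Nferm, Nblock);
  Eigen::MatrixXcd aa = Eigen::MatrixXcd::Random(Nblock, Nblock);

  // sliceMaddMatrix(R, aa, X, Y, Orthog, scale):  R_i = Y_i + scale * X_j aa(j,i)
  Eigen::MatrixXcd R = Y + scale * (X * aa);

  // the same thing written out slice by slice
  Eigen::MatrixXcd Rcheck = Y;
  for (int i = 0; i < Nblock; i++)
    for (int j = 0; j < Nblock; j++)
      Rcheck.col(i) += X.col(j) * (scale * aa(j,i));

  std::cout << "|R - Rcheck| = " << (R - Rcheck).norm() << std::endl;

  // sliceMulMatrix(Q, Cinv, R, Orthog) is the same with Y = 0:  Q = R * Cinv
  return 0;
}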