From 86aaa35294b6bcc1dc90441283bd89f434cfc977 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 7 Apr 2017 11:07:40 +0900 Subject: [PATCH 01/36] Christoph needs SchurDiagTwoKappa which is mobius specific. --- lib/qcd/action/fermion/Fermion.h | 1 + lib/qcd/action/fermion/SchurDiagTwoKappa.h | 102 +++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 lib/qcd/action/fermion/SchurDiagTwoKappa.h diff --git a/lib/qcd/action/fermion/Fermion.h b/lib/qcd/action/fermion/Fermion.h index 791afeee..ac42f177 100644 --- a/lib/qcd/action/fermion/Fermion.h +++ b/lib/qcd/action/fermion/Fermion.h @@ -58,6 +58,7 @@ Author: Peter Boyle #include #include #include +#include #include #include #include diff --git a/lib/qcd/action/fermion/SchurDiagTwoKappa.h b/lib/qcd/action/fermion/SchurDiagTwoKappa.h new file mode 100644 index 00000000..8305f98a --- /dev/null +++ b/lib/qcd/action/fermion/SchurDiagTwoKappa.h @@ -0,0 +1,102 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: SchurDiagTwoKappa.h + + Copyright (C) 2017 + +Author: Christoph Lehner +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
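+
+ In brief (a sketch of what the definitions below do): for (Z)Mobius
+ fermions one forms, per s-slice,
+
+   kappa_s = 1 / ( 2 ( b_s (4 - M5) + 1 ) )
+
+ and conjugates the even-odd preconditioned operator with the diagonal
+ (in s) matrix Kappa = diag(kappa_s),
+
+   Mpc'    = Kappa        Mpc    Kappa^{-1}
+   MpcDag' = Kappa^{-dag} MpcDag Kappa^{dag}
+
+ so that (Mpc')^dag = MpcDag' and the Hermiticity of MpcDag Mpc is
+ preserved; SchurDiagTwoKappaOperator below wires this together from
+ KappaSimilarityTransform and SchurDiagTwoOperator.
+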
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef _SCHUR_DIAG_TWO_KAPPA_H +#define _SCHUR_DIAG_TWO_KAPPA_H + +namespace Grid { + + // This is specific to (Z)mobius fermions + template + class KappaSimilarityTransform { + public: + INHERIT_IMPL_TYPES(Matrix); + std::vector kappa, kappaDag, kappaInv, kappaInvDag; + + KappaSimilarityTransform (Matrix &zmob) { + for (int i=0;i<(int)zmob.bs.size();i++) { + Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) ); + kappa.push_back( k ); + kappaDag.push_back( conj(k) ); + kappaInv.push_back( 1.0 / k ); + kappaInvDag.push_back( 1.0 / conj(k) ); + } + } + + template + void sscale(const Lattice& in, Lattice& out, Coeff_t* s) { + GridBase *grid=out._grid; + out.checkerboard = in.checkerboard; + assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now + int Ls = grid->_rdimensions[0]; + parallel_for(int ss=0;ssoSites();ss++){ + vobj tmp = s[ss % Ls]*in._odata[ss]; + vstream(out._odata[ss],tmp); + } + } + + RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) { + sscale(in,out,s); + return norm2(out); + } + + virtual RealD M (const Field& in, Field& out) { return sscale_norm(in,out,&kappa[0]); } + virtual RealD MDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaDag[0]);} + virtual RealD MInv (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInv[0]);} + virtual RealD MInvDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInvDag[0]);} + + }; + + template + class SchurDiagTwoKappaOperator : public SchurOperatorBase { + public: + KappaSimilarityTransform _S; + SchurDiagTwoOperator _Mat; + + SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {}; + + virtual RealD Mpc (const Field &in, Field &out) { + Field tmp(in._grid); + + _S.MInv(in,out); + _Mat.Mpc(out,tmp); + return _S.M(tmp,out); + + } + virtual RealD MpcDag (const Field &in, Field &out){ + Field tmp(in._grid); + + _S.MDag(in,out); + _Mat.MpcDag(out,tmp); + return _S.MInvDag(tmp,out); + } + }; + +} + +#endif From 683550f1166d316f3af722228c8be4b6c87c8d16 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 9 Apr 2017 23:41:04 +0900 Subject: [PATCH 02/36] Const args improvement --- lib/lattice/Lattice_transfer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 8eb75f15..68de52d0 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -359,7 +359,7 @@ void localConvert(const Lattice &in,Lattice &out) template -void InsertSlice(Lattice &lowDim,Lattice & higherDim,int slice, int orthog) +void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice, int orthog) { typedef typename vobj::scalar_object sobj; @@ -401,7 +401,7 @@ void InsertSlice(Lattice &lowDim,Lattice & higherDim,int slice, int } template -void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, int orthog) +void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slice, int orthog) { typedef typename vobj::scalar_object sobj; @@ -444,7 +444,7 @@ void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, in template -void 
InsertSliceLocal(Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) +void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) { typedef typename vobj::scalar_object sobj; From db5f6d3ae39a77800faab9cf0e70f2f24898c1e9 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 9 Apr 2017 23:41:30 +0900 Subject: [PATCH 03/36] Verbose fix --- lib/qcd/action/fermion/ImprovedStaggeredFermion.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 2ba4f4af..5ce2b335 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -160,8 +160,6 @@ void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin,const PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - std::cout << " Umu " << Umu._odata[0]< Date: Sun, 9 Apr 2017 23:42:10 +0900 Subject: [PATCH 04/36] Start of blockCG --- .../iterative/BlockConjugateGradient.h | 337 ++++++++++++++++++ .../solver/Test_staggered_block_cg_unprec.cc | 94 +++++ tests/solver/Test_staggered_cg_unprec.cc | 83 +++++ 3 files changed, 514 insertions(+) create mode 100644 lib/algorithms/iterative/BlockConjugateGradient.h create mode 100644 tests/solver/Test_staggered_block_cg_unprec.cc create mode 100644 tests/solver/Test_staggered_cg_unprec.cc diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h new file mode 100644 index 00000000..616387d8 --- /dev/null +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -0,0 +1,337 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h + +Copyright (C) 2017 + +Author: Azusa Yamaguchi +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
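+
+In brief: the solver below implements the classic block CG recursion
+(cf. O'Leary, 1980) over Nblock right-hand sides laid out along lattice
+dimension Orthog, with X the block solution, R the block residual and P
+the block search direction:
+
+  alpha = (P^dag A P)^{-1} (R^dag R)
+  X    <- X + P alpha ;   R <- R - A P alpha
+  beta  = (R^dag R)_old^{-1} (R^dag R)_new
+  P    <- R + P beta
+
+The small Nblock x Nblock inner-product matrices are handled with Eigen,
+and the slice helpers defined first (makeSubSliceGrid, sliceMaddMatrix,
+sliceInnerProductMatrix, sliceNorm) treat dimension Orthog as the block
+index.
+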
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H +#define GRID_BLOCK_CONJUGATE_GRADIENT_H + +#include + +namespace Grid { + +GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) +{ + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); +} + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i +static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i +static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss +static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = rhs._grid->GlobalDimensions()[Orthog]; + std::vector ip(Nblock); + sn.resize(Nblock); + + sliceInnerProductVector(ip,rhs,rhs,Orthog); + for(int ss=0;ss +static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object 
scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + std::cout << " sliceInnerProductMatrix Dim "< IP(lhs._grid); + std::vector sip(Nblock); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + Lattice tmp = rhs; + + for(int s1=0;s1 +class BlockConjugateGradient : public OperatorFunction { + public: + + typedef typename Field::scalar_type scomplex; + + const int blockDim = 0; + + int Nblock; + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + + BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + ErrorOnNoConverge(err_on_no_conv){}; + +void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + int Orthog = 0; // First dimension is block dim + Nblock = Src._grid->_fdimensions[Orthog]; + std::cout< residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,Src,Orthog); + RealD sssum=0; + for(int b=0;b max_resid ) max_resid = rr; + } + + if ( max_resid < Tolerance*Tolerance ) { + std::cout << GridLogMessage<<" Block solver has converged in " + < +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
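+
+ In brief: this test reuses the 5D improved staggered operator as a
+ cheap source of multiple right-hand sides (the Ls=8 s-slices propagate
+ independently, and dimension zero, the s-direction, is exactly the
+ block dimension the block solver expects), then solves the same system
+ with plain CG and with BlockConjugateGradient for comparison. A minimal
+ way to run it, assuming a serial build and the thread flag used
+ elsewhere in this series:
+
+   ./tests/solver/Test_staggered_block_cg_unprec --threads 1
+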
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DR::ImplParams params; + + const int Ls=8; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds); + GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds); + + FermionField src(FGrid); random(pRNG5,src); + FermionField result(FGrid); result=zero; + RealD nrm = norm2(src); + + LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + + RealD mass=0.01; + ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); + + MdagMLinearOperator HermOp(Ds); + + ConjugateGradient CG(1.0e-8,10000); + BlockConjugateGradient BCG(1.0e-8,10000); + + std::cout << GridLogMessage << " Calling CG "< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
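+
+ In brief: CG needs a Hermitian positive-definite matrix, and the
+ staggered operator M itself is not Hermitian, so this test wraps it as
+ MdagMLinearOperator and solves the normal equations
+
+   (M^dag M) psi = src
+
+ on the full (unpreconditioned) grid; it is the single right-hand-side
+ baseline for the block and multiRHS variants exercised in the companion
+ test above.
+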
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; + typename ImprovedStaggeredFermionR::ImplParams params; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + FermionField src(&Grid); random(pRNG,src); + RealD nrm = norm2(src); + FermionField result(&Grid); result=zero; + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + double volume=1; + for(int mu=0;mu HermOp(Ds); + ConjugateGradient CG(1.0e-8,10000); + CG(HermOp,src,result); + + Grid_finalize(); +} From d80d802f9daafee01462579ca7eb4b9b8f452c6e Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 10 Apr 2017 00:12:12 +0900 Subject: [PATCH 05/36] MultiRHS solver test --- .../iterative/BlockConjugateGradient.h | 159 ++++++++++++++++++ .../solver/Test_staggered_block_cg_unprec.cc | 5 + 2 files changed, 164 insertions(+) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 616387d8..c333f8e7 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -81,6 +81,30 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice } }; template +static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { typedef typename vobj::scalar_object sobj; @@ -194,6 +218,8 @@ static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &Linop, const Field &Src, Field &Psi) IterationsToComplete = k; } }; + + +////////////////////////////////////////////////////////////////////////// +// multiRHS conjugate gradient. Dimension zero should be the block direction +////////////////////////////////////////////////////////////////////////// +template +class MultiRHSConjugateGradient : public OperatorFunction { + public: + + typedef typename Field::scalar_type scomplex; + + const int blockDim = 0; + + int Nblock; + bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. + // Defaults true. 
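+  // A sketch of the recursion (one scalar CG per right-hand side b along
+  // dimension zero): compared with BlockConjugateGradient above, the
+  // Nblock x Nblock Eigen matrices degenerate to the diagonals v_pAp,
+  // v_rr, v_alpha, v_beta used below, i.e.
+  //
+  //   alpha_b = rr_b / (p_b, A p_b) ,   beta_b = rr_b^new / rr_b^old
+  //
+  // so the systems stay independent while sharing each single matrix
+  // application across all Nblock right-hand sides.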
+ RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + + MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) + : Tolerance(tol), + MaxIterations(maxit), + ErrorOnNoConverge(err_on_no_conv){}; + +void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + int Orthog = 0; // First dimension is block dim + Nblock = Src._grid->_fdimensions[Orthog]; + std::cout< v_pAp(Nblock); + std::vector v_rr (Nblock); + std::vector v_rr_inv(Nblock); + std::vector v_alpha(Nblock); + std::vector v_beta(Nblock); + + // Initial residual computation & set up + std::vector residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,Src,Orthog); + RealD sssum=0; + for(int b=0;b max_resid ) max_resid = rr; + } + + if ( max_resid < Tolerance*Tolerance ) { + std::cout << GridLogMessage<<" MultiRHS solver has converged in " + < CG(1.0e-8,10000); BlockConjugateGradient BCG(1.0e-8,10000); + MultiRHSConjugateGradient mCG(1.0e-8,10000); std::cout << GridLogMessage << " Calling CG "< Date: Mon, 10 Apr 2017 20:38:20 +0900 Subject: [PATCH 06/36] Commenting and clean up --- .../iterative/BlockConjugateGradient.h | 20 +++++++++++++++++++ lib/algorithms/iterative/ConjugateGradient.h | 3 +-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index c333f8e7..0f4a3a80 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -69,8 +69,14 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice Lattice Xslice(SliceGrid); Lattice Rslice(SliceGrid); + // FIXME: Implementation is slow // If we based this on Cshift it would work for spread out // but it would be even slower + // + // Repeated extract slice is inefficient + // + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. for(int i=0;i static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, int Orthog,RealD scale=1.0) { + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -107,6 +116,8 @@ static void sliceMaddVector (Lattice &R,std::vector &a,const Lattic template static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { + // FIXME: Implementation is slow + // Not sure of best solution.. 
think about it typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -141,6 +152,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice template static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -257,6 +271,10 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) Field AP(Src); Field R(Src); + GridStopWatch LinalgTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock); @@ -339,8 +357,10 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) } if ( max_resid < Tolerance*Tolerance ) { + std::cout << GridLogMessage<<" Block solver has converged in " < { } std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: k=0 residual " << cp << " target " << rsq - << std::endl; + << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; From 98a24ebf31d071f0d13bc67f165ce904c49db7d7 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 10 Apr 2017 16:58:54 +0100 Subject: [PATCH 07/36] =?UTF-8?q?The=20macro=20=E2=80=9Cmagics=E2=80=9D=20?= =?UTF-8?q?is=20very=20intensive=20for=20the=20preprocessor=20in=20the=20m?= =?UTF-8?q?easurement=20code=20which=20has=20numerous=20serialisable=20cla?= =?UTF-8?q?sses.=20Reducing=20the=20number=20of=20serialisable=20fields=20?= =?UTF-8?q?to=2064=20(instead=20of=201024)=20helps=20a=20lot,=20this=20is?= =?UTF-8?q?=20enough=20for=20now=20and=20can=20be=20extended=20trivially?= =?UTF-8?q?=20if=20needed=20in=20the=20future.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/serialisation/MacroMagic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index ca789312..a864989c 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -54,7 +54,7 @@ THE SOFTWARE. #define GRID_MACRO_EMPTY() -#define GRID_MACRO_EVAL(...) GRID_MACRO_EVAL1024(__VA_ARGS__) +#define GRID_MACRO_EVAL(...) GRID_MACRO_EVAL64(__VA_ARGS__) #define GRID_MACRO_EVAL1024(...) GRID_MACRO_EVAL512(GRID_MACRO_EVAL512(__VA_ARGS__)) #define GRID_MACRO_EVAL512(...) GRID_MACRO_EVAL256(GRID_MACRO_EVAL256(__VA_ARGS__)) #define GRID_MACRO_EVAL256(...) 
GRID_MACRO_EVAL128(GRID_MACRO_EVAL128(__VA_ARGS__)) From 8ef4300412ee7a3ac028effae9b3a52a732179fe Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 10 Apr 2017 17:00:22 +0100 Subject: [PATCH 08/36] spurious .dirstamp files removed --- lib/algorithms/approx/.dirstamp | 0 lib/communicator/.dirstamp | 0 lib/qcd/action/fermion/.dirstamp | 0 lib/qcd/spin/.dirstamp | 0 lib/qcd/utils/.dirstamp | 0 lib/stencil/.dirstamp | 0 6 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 lib/algorithms/approx/.dirstamp delete mode 100644 lib/communicator/.dirstamp delete mode 100644 lib/qcd/action/fermion/.dirstamp delete mode 100644 lib/qcd/spin/.dirstamp delete mode 100644 lib/qcd/utils/.dirstamp delete mode 100644 lib/stencil/.dirstamp diff --git a/lib/algorithms/approx/.dirstamp b/lib/algorithms/approx/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/communicator/.dirstamp b/lib/communicator/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/qcd/action/fermion/.dirstamp b/lib/qcd/action/fermion/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/qcd/spin/.dirstamp b/lib/qcd/spin/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/qcd/utils/.dirstamp b/lib/qcd/utils/.dirstamp deleted file mode 100644 index e69de29b..00000000 diff --git a/lib/stencil/.dirstamp b/lib/stencil/.dirstamp deleted file mode 100644 index e69de29b..00000000 From cb6b81ae8291a3cecf2cc896a04842792735e67e Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 12 Apr 2017 19:32:37 +0100 Subject: [PATCH 09/36] Half precision conversion --- configure.ac | 10 ++--- lib/simd/Grid_avx.h | 37 ++++++++++++++++++ lib/simd/Grid_avx512.h | 36 +++++++++++++++++ lib/simd/Grid_sse4.h | 57 +++++++++++++++++++++++---- lib/simd/Grid_vector_types.h | 65 ++++++++++++++++++++++++++++-- tests/Test_simd.cc | 76 ++++++++++++++++++++++++++++++++++-- 6 files changed, 261 insertions(+), 20 deletions(-) diff --git a/configure.ac b/configure.ac index 3a323a5d..6924fae1 100644 --- a/configure.ac +++ b/configure.ac @@ -176,19 +176,19 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) - SIMD_FLAGS='-msse4.2';; + SIMD_FLAGS='-msse4.2 -mf16c';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) - SIMD_FLAGS='-mavx';; + SIMD_FLAGS='-mavx -mf16c';; AVXFMA4) AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4]) - SIMD_FLAGS='-mavx -mfma4';; + SIMD_FLAGS='-mavx -mfma4 -mf16c';; AVXFMA) AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3]) - SIMD_FLAGS='-mavx -mfma';; + SIMD_FLAGS='-mavx -mfma -mf16c';; AVX2) AC_DEFINE([AVX2],[1],[AVX2 intrinsics]) - SIMD_FLAGS='-mavx2 -mfma';; + SIMD_FLAGS='-mavx2 -mfma -mf16c';; AVX512) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 2dbe26f4..86422f51 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -471,6 +471,42 @@ namespace Optimization { }; }; + struct PrecisionChange { + static inline __m256i StoH (__m256 a,__m256 b) { + __m128i ha = _mm256_cvtps_ph(a,0); + __m128i hb = _mm256_cvtps_ph(b,0); + __m256 h = _mm256_castps128_ps256(ha); + h = _mm256_insertf128_ps(h,hb,1); + return h; + } + static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { + sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0)); + sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1)); + } + static inline __m256 DtoS (__m256d a,__m256d b) { + __m128 sa = _mm256_cvtpd_ps(a); + __m128 
sb = _mm256_cvtpd_ps(b); + __m256 s = _mm256_castps128_ps256(sa); + s = _mm256_insertf128_ps(s,sb,1); + return s; + } + static inline void StoD (__m256 s,__m256d &a,__m256d &b) { + a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0)); + b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1)); + } + static inline __m256 DtoH (__m256i a,__m256 b,__m256 c,__m256 d) { + __m256 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m256i h,__m256d &a,__m256d &b,__m256d &c,__m256d &d) { + __m256 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; struct Exchange{ // 3210 ordering static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ @@ -675,6 +711,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types + typedef __m256i SIMD_Htype; // Single precision type typedef __m256 SIMD_Ftype; // Single precision type typedef __m256d SIMD_Dtype; // Double precision type typedef __m256i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index f39c4033..647fba77 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -343,6 +343,42 @@ namespace Optimization { }; + struct PrecisionChange { + static inline __m512i StoH (__m512 a,__m512 b) { + __m256i ha = _mm512_cvtps_ph(a,0); + __m256i hb = _mm512_cvtps_ph(b,0); + __m512 h = _mm512_castps256_ps512(ha); + h = _mm512_insertf256_ps(h,hb,1); + return h; + } + static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) { + sa = _mm512_cvtph_ps(_mm512_extractf256_ps(h,0)); + sb = _mm512_cvtph_ps(_mm512_extractf256_ps(h,1)); + } + static inline __m512 DtoS (__m512d a,__m512d b) { + __m256 sa = _mm512_cvtpd_ps(a); + __m256 sb = _mm512_cvtpd_ps(b); + __m512 s = _mm512_castps256_ps512(sa); + s = _mm512_insertf256_ps(s,sb,1); + return s; + } + static inline void StoD (__m512 s,__m512d &a,__m512d &b) { + a = _mm512_cvtps_pd(_mm512_extractf256_ps(s,0)); + b = _mm512_cvtps_pd(_mm512_extractf256_ps(s,1)); + } + static inline __m512 DtoH (__m512i a,__m512 b,__m512 c,__m512 d) { + __m512 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m512i h,__m512d &a,__m512d &b,__m512d &c,__m512d &d) { + __m512 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl // The operation is its own inverse diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index fcad4c28..cf78aa52 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -38,6 +38,7 @@ Author: neo #include + namespace Grid { namespace Optimization { @@ -328,6 +329,48 @@ namespace Optimization { }; }; + +#ifndef _mm_alignr_epi64 +#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) +#endif + struct PrecisionChange { + static inline __m128i StoH (__m128 a,__m128 b) { + __m128i ha = _mm_cvtps_ph(a,0); + __m128i hb = _mm_cvtps_ph(b,0); + __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + return h; + } + static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { + sa = _mm_cvtph_ps(h); + h = (__m128)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + sb = _mm_cvtph_ps(h); + } + static inline __m128 DtoS (__m128d a,__m128d b) { + __m128 sa = _mm_cvtpd_ps(a); + __m128 sb = _mm_cvtpd_ps(b); + __m128 s = _mm_shuffle_ps(sa,sb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + 
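// Lane layout (assuming _MM_SELECT_FOUR_FOUR packs its selectors
+      // high-to-low, as used throughout this file): _mm_cvtpd_ps leaves its
+      // two converted floats in lanes 1:0, so the shuffle above yields
+      // s = [ b1 b0 a1 a0 ] from high lane to low, i.e. `a` supplies the
+      // low half of the single-precision result.
+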
return s; + } + static inline void StoD (__m128 s,__m128d &a,__m128d &b) { + a = _mm_cvtps_pd(s); + s = (__m128)_mm_alignr_epi32((__m128i)s,(__m128i)s,2); + b = _mm_cvtps_pd(s); + } + static inline __m128 DtoH (__m128i a,__m128 b,__m128 c,__m128 d) { + __m128 sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (__m128i h,__m128d &a,__m128d &b,__m128d &c,__m128d &d) { + __m128 sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; + struct Exchange{ // 3210 ordering static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ @@ -335,8 +378,10 @@ namespace Optimization { out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); }; static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ - out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ + out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ + out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ }; static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ assert(0); @@ -383,11 +428,6 @@ namespace Optimization { default: assert(0); } } - -#ifndef _mm_alignr_epi64 -#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) -#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) -#endif template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; @@ -450,7 +490,8 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef __m128 SIMD_Ftype; // Single precision type + typedef __m128i SIMD_Htype; // Single precision type + typedef __m128 SIMD_Ftype; // Single precision type typedef __m128d SIMD_Dtype; // Double precision type typedef __m128i SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 57e7f11e..ccedf99c 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -358,16 +358,12 @@ class Grid_simd { { if (n==3) { Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); - // std::cout << " Exchange3 "<< out1<<" "<< out2<<" <- " << in1 << " "<, SIMD_Ftype> vComplexF; typedef Grid_simd, SIMD_Dtype> vComplexD; typedef Grid_simd vInteger; +// Half precision; no arithmetic support +typedef Grid_simd vRealH; +typedef Grid_simd, SIMD_Htype> vComplexH; + +inline void precisionChange(vRealF *out,vRealD *in,int nvec) +{ + assert((nvec&0x1)==0); + for(int m=0;m*2({45,12,81,9})); + const int Ndp = 16; + const int Nsp = Ndp/2; + const int Nhp = Ndp/4; + std::vector > H (Nhp); + std::vector > F (Nsp); + std::vector > FF(Nsp); + std::vector > D (Ndp); + std::vector > DD(Ndp); + for(int i=0;i<16;i++){ + random(sRNG,D[i]); + } + // Double to Single + precisionChange(&F[0],&D[0],Ndp); + precisionChange(&DD[0],&F[0],Ndp); + std::cout << GridLogMessage<<"Double to single"; + for(int i=0;i Date: Thu, 13 Apr 2017 08:38:12 +0100 Subject: [PATCH 10/36] Exchange in generic Precision change in AVX, SSE, AVX512, Generic. QPX still to do. 
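
Note on the packing convention, common to all the implementations: StoH(a,b)
produces one half-width vector holding a in the low half and b in the high
half, DtoS does the same one precision level up, and DtoH/HtoD are just the
two composed. A minimal scalar model of that convention (illustrative only;
the real code operates on SIMD registers):

  #include <vector>
  // toy DtoS: two double "vectors" -> one float "vector", a low, b high
  std::vector<float> DtoS(const std::vector<double> &a,
                          const std::vector<double> &b) {
    std::vector<float> s;
    for (double x : a) s.push_back((float)x); // low half  <- a
    for (double x : b) s.push_back((float)x); // high half <- b
    return s;
  }

Caveat for the generic path: its StoH keeps the upper 16 bits of each float
(a bfloat16-style truncation, assuming little-endian storage) rather than
performing an IEEE binary16 conversion as the x86 F16C intrinsics do, so
generic and x86 builds are not bit-compatible at half precision.
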
--- configure.ac | 2 +- lib/simd/Grid_avx.h | 6 +-- lib/simd/Grid_avx512.h | 4 +- lib/simd/Grid_generic.h | 88 +++++++++++++++++++++++++++++++++++ lib/simd/Grid_generic_types.h | 11 +++-- lib/simd/Grid_qpx.h | 1 - lib/simd/Grid_sse4.h | 2 +- tests/Test_simd.cc | 41 +++++++++------- 8 files changed, 126 insertions(+), 29 deletions(-) diff --git a/configure.ac b/configure.ac index 6924fae1..298c04de 100644 --- a/configure.ac +++ b/configure.ac @@ -10,7 +10,7 @@ AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ############### Checks for programs -CXXFLAGS="-O3 $CXXFLAGS" +CXXFLAGS="-g $CXXFLAGS" AC_PROG_CXX AC_PROG_RANLIB diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 86422f51..c2e02cb2 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -377,8 +377,8 @@ namespace Optimization { b0 = _mm256_extractf128_si256(b,0); a1 = _mm256_extractf128_si256(a,1); b1 = _mm256_extractf128_si256(b,1); - a0 = _mm_mul_epi32(a0,b0); - a1 = _mm_mul_epi32(a1,b1); + a0 = _mm_mullo_epi32(a0,b0); + a1 = _mm_mullo_epi32(a1,b1); return _mm256_set_m128i(a1,a0); #endif #if defined (AVX2) @@ -494,7 +494,7 @@ namespace Optimization { a = _mm256_cvtps_pd(_mm256_extractf128_ps(s,0)); b = _mm256_cvtps_pd(_mm256_extractf128_ps(s,1)); } - static inline __m256 DtoH (__m256i a,__m256 b,__m256 c,__m256 d) { + static inline __m256i DtoH (__m256d a,__m256d b,__m256d c,__m256d d) { __m256 sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 647fba77..87798637 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -235,11 +235,9 @@ namespace Optimization { inline void mac(__m512 &a, __m512 b, __m512 c){ a= _mm512_fmadd_ps( b, c, a); } - inline void mac(__m512d &a, __m512d b, __m512d c){ a= _mm512_fmadd_pd( b, c, a); } - // Real float inline __m512 operator()(__m512 a, __m512 b){ return _mm512_mul_ps(a,b); @@ -366,7 +364,7 @@ namespace Optimization { a = _mm512_cvtps_pd(_mm512_extractf256_ps(s,0)); b = _mm512_cvtps_pd(_mm512_extractf256_ps(s,1)); } - static inline __m512 DtoH (__m512i a,__m512 b,__m512 c,__m512 d) { + static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) { __m512 sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 7972da55..466aa2f7 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -279,6 +279,93 @@ namespace Optimization { #undef timesi + struct PrecisionChange { + static inline vech StoH (const vecf &a,const vecf &b) { + vech ret; + vech *ha = (vech *)&a; + vech *hb = (vech *)&b; + const int nf = W::r; + // VECTOR_FOR(i, nf,1){ ret.v[i] = ( (uint16_t *) &a.v[i])[1] ; } + // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } + VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } + VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } + return ret; + } + static inline void HtoS (vech h,vecf &sa,vecf &sb) { + const int nf = W::r; + const int nh = W::r; + vech *ha = (vech *)&sa; + vech *hb = (vech *)&sb; + VECTOR_FOR(i, nf, 1){ sb.v[i]= sa.v[i] = 0; } + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sa.v[i]))[1] = h.v[i];} + // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} + VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } + VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } + } + static inline vecf DtoS (vecd a,vecd b) { + const int nd = W::r; + const int nf = W::r; + vecf ret; + VECTOR_FOR(i, nd,1){ ret.v[i] = a.v[i] ; } + VECTOR_FOR(i, nd,1){ 
ret.v[i+nd] = b.v[i] ; } + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + const int nd = W::r; + VECTOR_FOR(i, nd,1){ a.v[i] = s.v[i] ; } + VECTOR_FOR(i, nd,1){ b.v[i] = s.v[i+nd] ; } + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); + } + static inline void HtoD (vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); + } + }; + + ////////////////////////////////////////////// + // Exchange support + struct Exchange{ + + template + static inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){ + const int w = W::r; + unsigned int mask = w >> (n + 1); + // std::cout << " Exchange "< + static inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + template + static inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN(out1,out2,in1,in2); + }; + }; + + ////////////////////////////////////////////// // Some Template specialization #define perm(a, b, n, w)\ @@ -403,6 +490,7 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types + typedef Optimization::vech SIMD_Htype; // Reduced precision type typedef Optimization::vecf SIMD_Ftype; // Single precision type typedef Optimization::vecd SIMD_Dtype; // Double precision type typedef Optimization::veci SIMD_Itype; // Integer type diff --git a/lib/simd/Grid_generic_types.h b/lib/simd/Grid_generic_types.h index 2142bc8e..642f6ffe 100644 --- a/lib/simd/Grid_generic_types.h +++ b/lib/simd/Grid_generic_types.h @@ -66,6 +66,10 @@ namespace Optimization { template <> struct W { constexpr static unsigned int r = GEN_SIMD_WIDTH/4u; }; + template <> struct W { + constexpr static unsigned int c = GEN_SIMD_WIDTH/4u; + constexpr static unsigned int r = GEN_SIMD_WIDTH/2u; + }; // SIMD vector types template @@ -73,8 +77,9 @@ namespace Optimization { alignas(GEN_SIMD_WIDTH) T v[W::r]; }; - typedef vec vecf; - typedef vec vecd; - typedef vec veci; + typedef vec vecf; + typedef vec vecd; + typedef vec vech; // half precision comms + typedef vec veci; }} diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index d77a560a..757c84c9 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -125,7 +125,6 @@ namespace Optimization { f[2] = a.v2; f[3] = a.v3; } - //Double inline void operator()(double *d, vector4double a){ vec_st(a, 0, d); diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index cf78aa52..89eb7daa 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -357,7 +357,7 @@ namespace Optimization { s = (__m128)_mm_alignr_epi32((__m128i)s,(__m128i)s,2); b = _mm_cvtps_pd(s); } - static inline __m128 DtoH (__m128i a,__m128 b,__m128 c,__m128 d) { + static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { __m128 sa,sb; sa = DtoS(a,b); sb = DtoS(c,d); diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 423372f1..abec2410 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -308,18 +308,23 @@ public: int n; funcExchange(int _n) { n=_n;}; template void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);} - template void apply(std::vector &r1,std::vector 
&r2,std::vector &in1,std::vector &in2) const { + template void apply(std::vector &r1, + std::vector &r2, + std::vector &in1, + std::vector &in2) const + { int sz=in1.size(); - - int msk = sz>>(n+1); - int j1=0; - int j2=0; - for(int i=0;i Date: Thu, 13 Apr 2017 08:40:44 +0100 Subject: [PATCH 11/36] Align cast fixed for __mm128i gcc complained --- lib/simd/Grid_sse4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 89eb7daa..b43c0628 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -343,7 +343,7 @@ namespace Optimization { } static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { sa = _mm_cvtph_ps(h); - h = (__m128)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); sb = _mm_cvtph_ps(h); } static inline __m128 DtoS (__m128d a,__m128d b) { From 9c3065b860bb349483201d6fee365794aa0a21fc Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 10:01:32 +0100 Subject: [PATCH 12/36] Debug flags off again --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 298c04de..6924fae1 100644 --- a/configure.ac +++ b/configure.ac @@ -10,7 +10,7 @@ AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ############### Checks for programs -CXXFLAGS="-g $CXXFLAGS" +CXXFLAGS="-O3 $CXXFLAGS" AC_PROG_CXX AC_PROG_RANLIB From c38400b26fab1f7596bcac21fe23c69c0437fa59 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 10:35:20 +0100 Subject: [PATCH 13/36] Trap signals --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index bc6dd0ef..107343ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -92,11 +92,11 @@ script: - cd build - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto From 1c2577331990df1a4376fed3b0c971dda7d04ebb Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 10:51:40 +0100 Subject: [PATCH 14/36] Trap illegal instructions --- lib/util/Init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/util/Init.cc b/lib/util/Init.cc index dd3e6d13..990a4e0f 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -457,5 +457,6 @@ void Grid_debug_handler_init(void) sigaction(SIGFPE,&sa,NULL); sigaction(SIGKILL,&sa,NULL); + sigaction(SIGILL,&sa,NULL); } } From 73cdf0fffea01ec30c835092657dd1785652e0c2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 11:23:41 +0100 Subject: [PATCH 15/36] Drop f16c from SSE because of a macos compile error on travis --- configure.ac | 3 ++- lib/simd/Grid_sse4.h | 15 +++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/configure.ac b/configure.ac index 6924fae1..c45693c5 100644 --- a/configure.ac +++ b/configure.ac @@ -176,7 +176,8 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) 
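 # Note: -mf16c is dropped for the pure SSE4 target below. Although
 # VCVTPS2PH/VCVTPH2PS have 128-bit forms, the F16C instructions are
 # VEX-encoded and only exist on AVX-capable hardware, so enabling the
 # flag in an SSE4-only build can misfire (cf. the compile error noted
 # here and the SIGILL reported later in this series); the AVX and wider
 # targets keep the flag.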
AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) - SIMD_FLAGS='-msse4.2 -mf16c';; +# SIMD_FLAGS='-msse4.2 -mf16c';; + SIMD_FLAGS='-msse4.2';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -mf16c';; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index b43c0628..a6ab369d 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -336,15 +336,18 @@ namespace Optimization { #endif struct PrecisionChange { static inline __m128i StoH (__m128 a,__m128 b) { - __m128i ha = _mm_cvtps_ph(a,0); - __m128i hb = _mm_cvtps_ph(b,0); - __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + // __m128i ha = _mm_cvtps_ph(a,0); + // __m128i hb = _mm_cvtps_ph(b,0); + // __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + __m128i h = (__m128i)a; + assert(0); return h; } static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { - sa = _mm_cvtph_ps(h); - h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); - sb = _mm_cvtph_ps(h); + // sa = _mm_cvtph_ps(h); + // h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + // sb = _mm_cvtph_ps(h); + assert(0); } static inline __m128 DtoS (__m128d a,__m128d b) { __m128 sa = _mm_cvtpd_ps(a); From 1d502e4ed6e481647702f2f1fe39fa9bb8c6084f Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 11:55:24 +0100 Subject: [PATCH 16/36] FP16 optional compile time --- configure.ac | 13 +++++++++++++ lib/simd/Grid_avx.h | 8 ++++++++ lib/simd/Grid_avx512.h | 8 ++++++++ lib/simd/Grid_generic.h | 8 ++++++++ lib/simd/Grid_sse4.h | 19 +++++++++++++------ 5 files changed, 50 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index c45693c5..e0820d79 100644 --- a/configure.ac +++ b/configure.ac @@ -83,6 +83,19 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; esac +############### FP16 conversions +AC_ARG_ENABLE([fp16], + [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])], + [ac_FP16=${enable_fp16}], [ac_FP16=no]) +case ${ac_FP16} in + no) + ;; + yes) + AC_DEFINE([USE_FP16],[1],[conversion to fp16]);; + *) + ;; +esac + ############### MKL AC_ARG_ENABLE([mkl], [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])], diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index c2e02cb2..73792f84 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -473,15 +473,23 @@ namespace Optimization { struct PrecisionChange { static inline __m256i StoH (__m256 a,__m256 b) { +#ifdef USE_FP16 __m128i ha = _mm256_cvtps_ph(a,0); __m128i hb = _mm256_cvtps_ph(b,0); __m256 h = _mm256_castps128_ps256(ha); h = _mm256_insertf128_ps(h,hb,1); +#else + assert(0); +#endif return h; } static inline void HtoS (__m256i h,__m256 &sa,__m256 &sb) { +#ifdef USE_FP16 sa = _mm256_cvtph_ps(_mm256_extractf128_ps(h,0)); sb = _mm256_cvtph_ps(_mm256_extractf128_ps(h,1)); +#else + assert(0); +#endif } static inline __m256 DtoS (__m256d a,__m256d b) { __m128 sa = _mm256_cvtpd_ps(a); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 87798637..dae3c1c7 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -343,15 +343,23 @@ namespace Optimization { struct PrecisionChange { static inline __m512i StoH (__m512 a,__m512 b) { +#ifdef USE_FP16 __m256i ha = _mm512_cvtps_ph(a,0); __m256i hb = _mm512_cvtps_ph(b,0); __m512 h = _mm512_castps256_ps512(ha); h = _mm512_insertf256_ps(h,hb,1); +#else + assert(0); +#endif return h; } static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) { +#ifdef USE_FP16 sa = 
_mm512_cvtph_ps(_mm512_extractf256_ps(h,0)); sb = _mm512_cvtph_ps(_mm512_extractf256_ps(h,1)); +#else + assert(0); +#endif } static inline __m512 DtoS (__m512d a,__m512d b) { __m256 sa = _mm512_cvtpd_ps(a); diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 466aa2f7..bdf5aa01 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -281,6 +281,7 @@ namespace Optimization { struct PrecisionChange { static inline vech StoH (const vecf &a,const vecf &b) { +#ifdef USE_FP16 vech ret; vech *ha = (vech *)&a; vech *hb = (vech *)&b; @@ -289,9 +290,13 @@ namespace Optimization { // VECTOR_FOR(i, nf,1){ ret.v[i+nf] = ( (uint16_t *) &b.v[i])[1] ; } VECTOR_FOR(i, nf,1){ ret.v[i] = ha->v[2*i+1]; } VECTOR_FOR(i, nf,1){ ret.v[i+nf] = hb->v[2*i+1]; } +#else + assert(0); +#endif return ret; } static inline void HtoS (vech h,vecf &sa,vecf &sb) { +#ifdef USE_FP16 const int nf = W::r; const int nh = W::r; vech *ha = (vech *)&sa; @@ -301,6 +306,9 @@ namespace Optimization { // VECTOR_FOR(i, nf, 1){ ( (uint16_t *) (&sb.v[i]))[1] = h.v[i+nf];} VECTOR_FOR(i, nf, 1){ ha->v[2*i+1]=h.v[i]; } VECTOR_FOR(i, nf, 1){ hb->v[2*i+1]=h.v[i+nf]; } +#else + assert(0); +#endif } static inline vecf DtoS (vecd a,vecd b) { const int nd = W::r; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index a6ab369d..969ba3ed 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -334,20 +334,27 @@ namespace Optimization { #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) #endif + struct PrecisionChange { static inline __m128i StoH (__m128 a,__m128 b) { - // __m128i ha = _mm_cvtps_ph(a,0); - // __m128i hb = _mm_cvtps_ph(b,0); - // __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); +#ifdef USE_FP16 + __m128i ha = _mm_cvtps_ph(a,0); + __m128i hb = _mm_cvtps_ph(b,0); + __m128i h =(__m128i) _mm_shuffle_ps((__m128)ha,(__m128)hb,_MM_SELECT_FOUR_FOUR(1,0,1,0)); +#else __m128i h = (__m128i)a; assert(0); +#endif return h; } static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { - // sa = _mm_cvtph_ps(h); - // h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); - // sb = _mm_cvtph_ps(h); +#ifdef USE_FP16 + sa = _mm_cvtph_ps(h); + h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + sb = _mm_cvtph_ps(h); +#else assert(0); +#endif } static inline __m128 DtoS (__m128d a,__m128d b) { __m128 sa = _mm_cvtpd_ps(a); From 2846f079e5002692e36edc866700895d31d6def2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 12:08:05 +0100 Subject: [PATCH 17/36] Predicate tests on fp16 being enabled --- tests/Test_simd.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index abec2410..c0bbef1d 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -725,6 +725,7 @@ int main (int argc, char ** argv) std::cout <<" OK ! 
"< Date: Thu, 13 Apr 2017 12:22:40 +0100 Subject: [PATCH 18/36] Update to use Xcode 8.3 since -mfp16 causes SIGILL --- .travis.yml | 2 +- configure.ac | 5 ++--- lib/simd/Grid_sse4.h | 14 ++++++-------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 107343ed..20716334 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ cache: matrix: include: - os: osx - osx_image: xcode7.2 + osx_image: xcode8.3 compiler: clang - compiler: gcc addons: diff --git a/configure.ac b/configure.ac index e0820d79..6b233c0d 100644 --- a/configure.ac +++ b/configure.ac @@ -86,7 +86,7 @@ esac ############### FP16 conversions AC_ARG_ENABLE([fp16], [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])], - [ac_FP16=${enable_fp16}], [ac_FP16=no]) + [ac_FP16=${enable_fp16}], [ac_FP16=yes]) case ${ac_FP16} in no) ;; @@ -189,8 +189,7 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) -# SIMD_FLAGS='-msse4.2 -mf16c';; - SIMD_FLAGS='-msse4.2';; + SIMD_FLAGS='-msse4.2 -mf16c';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -mf16c';; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 969ba3ed..8a4537c2 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -330,10 +330,8 @@ namespace Optimization { }; -#ifndef _mm_alignr_epi64 -#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) -#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) -#endif +#define _my_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) +#define _my_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) struct PrecisionChange { static inline __m128i StoH (__m128 a,__m128 b) { @@ -350,7 +348,7 @@ namespace Optimization { static inline void HtoS (__m128i h,__m128 &sa,__m128 &sb) { #ifdef USE_FP16 sa = _mm_cvtph_ps(h); - h = (__m128i)_mm_alignr_epi32((__m128i)h,(__m128i)h,2); + h = (__m128i)_my_alignr_epi32((__m128i)h,(__m128i)h,2); sb = _mm_cvtph_ps(h); #else assert(0); @@ -364,7 +362,7 @@ namespace Optimization { } static inline void StoD (__m128 s,__m128d &a,__m128d &b) { a = _mm_cvtps_pd(s); - s = (__m128)_mm_alignr_epi32((__m128i)s,(__m128i)s,2); + s = (__m128)_my_alignr_epi32((__m128i)s,(__m128i)s,2); b = _mm_cvtps_pd(s); } static inline __m128i DtoH (__m128d a,__m128d b,__m128d c,__m128d d) { @@ -439,8 +437,8 @@ namespace Optimization { } } - template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; - template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; + template static inline __m128 tRotate(__m128 in){ return (__m128)_my_alignr_epi32((__m128i)in,(__m128i)in,n); }; + template static inline __m128d tRotate(__m128d in){ return (__m128d)_my_alignr_epi64((__m128i)in,(__m128i)in,n); }; }; ////////////////////////////////////////////// From eb8e26018b3c579ffa29232df7ada67a024372b1 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 12:35:11 +0100 Subject: [PATCH 19/36] Travis update for macos --- .travis.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 20716334..df7eb5a7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -73,8 +73,6 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi 
- - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi install: - export CC=$CC$VERSION @@ -98,9 +96,11 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto - - make -j4 - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + - export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto; + - make -j4; + - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; + - fi From 5a4eafbf7e6ba77515eca2f5b201109f57c5da69 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 12:50:43 +0100 Subject: [PATCH 20/36] .travis --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index df7eb5a7..6eab0a4a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,8 +96,7 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - - export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto; - make -j4; - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; From 4226c633c4fb1284fe92c37ced904c1c81b742ba Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 12:51:39 +0100 Subject: [PATCH 21/36] Default to FP16 off again --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 6b233c0d..52acbfde 100644 --- a/configure.ac +++ b/configure.ac @@ -86,7 +86,7 @@ esac ############### FP16 conversions AC_ARG_ENABLE([fp16], [AC_HELP_STRING([--enable-fp16=yes|no], [enable fp16 comms])], - [ac_FP16=${enable_fp16}], [ac_FP16=yes]) + [ac_FP16=${enable_fp16}], [ac_FP16=no]) case ${ac_FP16} in no) ;; From 75ea306ce9c96b76a5663d0488399465b14295ed Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 13:05:32 +0100 Subject: [PATCH 22/36] Another try at travis --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6eab0a4a..278d2c1c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,8 +96,8 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto; + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - make -j4; - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; - fi From d3b9a7fa145b46e2fbdcb5f0318345b79a0563de Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 
13:19:11 +0100 Subject: [PATCH 23/36] F16c apparently requires AVX, even if the 128 bit are used. Seems odd. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 52acbfde..f18beb32 100644 --- a/configure.ac +++ b/configure.ac @@ -189,7 +189,7 @@ case ${ax_cv_cxx_compiler_vendor} in case ${ac_SIMD} in SSE4) AC_DEFINE([SSE4],[1],[SSE4 intrinsics]) - SIMD_FLAGS='-msse4.2 -mf16c';; + SIMD_FLAGS='-msse4.2';; AVX) AC_DEFINE([AVX1],[1],[AVX intrinsics]) SIMD_FLAGS='-mavx -mf16c';; From 2ed6c76fc5a256103ca7949db726fec53b6b9ed6 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 13:43:13 +0100 Subject: [PATCH 24/36] Getting multiline if then fi working --- .travis.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 278d2c1c..32c3c918 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,10 +96,10 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - - make -j4; - - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; - - fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; + make -j4; + mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; + fi From 0957378679c2f4a5cfe9d912858b1724b33d679c Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 13 Apr 2017 13:47:56 +0100 Subject: [PATCH 25/36] Fixing conditional ugly way --- .travis.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 32c3c918..a33d8ec0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,10 +96,8 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; - make -j4; - mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; - fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make -j4; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi From b9113ed310f4933b0754e6113906909581bdc623 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 13 Apr 2017 12:02:12 -0400 Subject: [PATCH 26/36] Patches for knl --- lib/simd/Grid_avx512.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index dae3c1c7..0c4061fb 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -343,11 +343,12 @@ namespace Optimization { struct PrecisionChange { static inline __m512i StoH (__m512 a,__m512 b) { + __m512i h; #ifdef USE_FP16 __m256i ha = _mm512_cvtps_ph(a,0); __m256i hb = _mm512_cvtps_ph(b,0); - __m512 h = _mm512_castps256_ps512(ha); - h = _mm512_insertf256_ps(h,hb,1); + h =(__m512i) _mm512_castps256_ps512(ha); + h 
=(__m512i) _mm512_insertf64x4((__m512d)h,(__m512d)hb,1); #else assert(0); #endif @@ -365,12 +366,12 @@ namespace Optimization { __m256 sa = _mm512_cvtpd_ps(a); __m256 sb = _mm512_cvtpd_ps(b); __m512 s = _mm512_castps256_ps512(sa); - s = _mm512_insertf256_ps(s,sb,1); + s =(__m512) _mm512_insertf64x4((__m512d)s,(__m256d)sb,1); return s; } static inline void StoD (__m512 s,__m512d &a,__m512d &b) { - a = _mm512_cvtps_pd(_mm512_extractf256_ps(s,0)); - b = _mm512_cvtps_pd(_mm512_extractf256_ps(s,1)); + a = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,0)); + b = _mm512_cvtps_pd((__m256)_mm512_extractf64x4_pd((__m512d)s,1)); } static inline __m512i DtoH (__m512d a,__m512d b,__m512d c,__m512d d) { __m512 sa,sb; @@ -581,7 +582,9 @@ namespace Optimization { ////////////////////////////////////////////////////////////////////////////////////// // Here assign types - typedef __m512 SIMD_Ftype; // Single precision type + + typedef __m512i SIMD_Htype; // Single precision type + typedef __m512 SIMD_Ftype; // Single precision type typedef __m512d SIMD_Dtype; // Double precision type typedef __m512i SIMD_Itype; // Integer type From 951be75292043995589bcb780b1e9702e4ea8c14 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 13 Apr 2017 17:35:11 +0100 Subject: [PATCH 27/36] Half precision conversion working on AVX512 now too --- lib/simd/Grid_avx512.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 0c4061fb..9156f113 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -347,8 +347,8 @@ namespace Optimization { #ifdef USE_FP16 __m256i ha = _mm512_cvtps_ph(a,0); __m256i hb = _mm512_cvtps_ph(b,0); - h =(__m512i) _mm512_castps256_ps512(ha); - h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m512d)hb,1); + h =(__m512i) _mm512_castps256_ps512((__m256)ha); + h =(__m512i) _mm512_insertf64x4((__m512d)h,(__m256d)hb,1); #else assert(0); #endif @@ -356,8 +356,8 @@ namespace Optimization { } static inline void HtoS (__m512i h,__m512 &sa,__m512 &sb) { #ifdef USE_FP16 - sa = _mm512_cvtph_ps(_mm512_extractf256_ps(h,0)); - sb = _mm512_cvtph_ps(_mm512_extractf256_ps(h,1)); + sa = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,0)); + sb = _mm512_cvtph_ps((__m256i)_mm512_extractf64x4_pd((__m512d)h,1)); #else assert(0); #endif From 3ca41458a3be2f2eb055947ab647c49fb1ae782a Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 14 Apr 2017 14:20:54 +0100 Subject: [PATCH 28/36] Fix to no USE_FP16 case --- lib/simd/Grid_avx.h | 5 +++-- lib/simd/Grid_avx512.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 73792f84..5c16dd71 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -470,13 +470,14 @@ namespace Optimization { return in; }; }; - +#define USE_FP16 struct PrecisionChange { static inline __m256i StoH (__m256 a,__m256 b) { + __m256 h; #ifdef USE_FP16 __m128i ha = _mm256_cvtps_ph(a,0); __m128i hb = _mm256_cvtps_ph(b,0); - __m256 h = _mm256_castps128_ps256(ha); + h = _mm256_castps128_ps256(ha); h = _mm256_insertf128_ps(h,hb,1); #else assert(0); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 9156f113..ba054665 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -340,7 +340,7 @@ namespace Optimization { }; }; - +#define USE_FP16 struct PrecisionChange { static inline __m512i StoH (__m512 a,__m512 b) { __m512i h; From a9c22d5f4347d4882b513ed63e47be63e7118d61 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 14 
Apr 2017 14:38:49 +0100 Subject: [PATCH 29/36] Verbose removal --- tests/core/Test_fft_gfix.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index c6b77a13..7938241e 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -148,11 +148,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl { Complex psqMax(16.0); Fp = psqMax*one/psq; + /* static int once; if ( once == 0 ) { std::cout << " Fp " << Fp < Date: Sat, 15 Apr 2017 08:54:11 +0100 Subject: [PATCH 30/36] Cleaning up the dense matrix and lanczos sector --- TODO | 62 ++- lib/algorithms/Algorithms.h | 2 +- .../{iterative => densematrix}/DenseMatrix.h | 0 .../{iterative => densematrix}/Francis.h | 0 .../{iterative => densematrix}/Householder.h | 0 .../iterative/ImplicitlyRestartedLanczos.h | 9 +- lib/algorithms/iterative/Matrix.h | 453 ------------------ lib/algorithms/iterative/MatrixUtils.h | 75 --- lib/algorithms/iterative/TODO | 15 - lib/algorithms/iterative/bisec.c | 122 ----- lib/algorithms/iterative/get_eig.c | 1 - lib/util/Init.cc | 4 +- tests/debug/Test_synthetic_lanczos.cc | 13 +- 13 files changed, 61 insertions(+), 695 deletions(-) rename lib/algorithms/{iterative => densematrix}/DenseMatrix.h (100%) rename lib/algorithms/{iterative => densematrix}/Francis.h (100%) rename lib/algorithms/{iterative => densematrix}/Householder.h (100%) delete mode 100644 lib/algorithms/iterative/Matrix.h delete mode 100644 lib/algorithms/iterative/MatrixUtils.h delete mode 100644 lib/algorithms/iterative/TODO delete mode 100644 lib/algorithms/iterative/bisec.c delete mode 100644 lib/algorithms/iterative/get_eig.c diff --git a/TODO b/TODO index df8554cc..91034f20 100644 --- a/TODO +++ b/TODO @@ -1,6 +1,27 @@ TODO: --------------- +Peter's work list: + +-- Merge high precision reduction into develop +-- Physical propagator interface +-- Precision conversion and sort out localConvert +-- slice* linalg routines for multiRHS, BlockCG +-- Profile CG, BlockCG, etc... Flop count/rate +-- Binary I/O speed up & x-strips +-- Half-precision comms +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination +-- GaugeFix into central location +-- Help Julia with NPR code +-- Switch to measurements +-- Multigrid Wilson and DWF, compare to other Multigrid implementations +-- Remove DenseVector, DenseMatrix; Use Eigen instead. +-- quaternions -- Might not need + + +-- Conserved currents + +----- * Forces; the UdSdU term in gauge force term is half of what I think it should be. This is a consequence of taking ONLY the first term in: @@ -21,16 +42,8 @@ TODO: This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two. This 2x is applied by hand in the fermion routines and in the Test_rect_force routine. - -Policies: - -* Link smearing/boundary conds; Policy class based implementation ; framework more in place - * Support different boundary conditions (finite temp, chem. potential ... ) -* Support different fermion representations? - - contained entirely within the integrator presently - - Sign of force term. - Reversibility test. @@ -41,11 +54,6 @@ Policies: - Audit oIndex usage for cb behaviour -- Rectangle gauge actions. - Iwasaki, - Symanzik, - ... etc... - - Prepare multigrid for HMC. - Alternate setup schemes. - Support for ILDG --- ugly, not done @@ -55,9 +63,11 @@ Policies: - FFTnD ? - Gparity; hand opt use template specialisation elegance to enable the optimised paths ? 
+ - Gparity force term; Gparity (R)HMC. -- Random number state save restore + - Mobius implementation clean up to rmove #if 0 stale code sequences + - CG -- profile carefully, kernel fusion, whole CG performance measurements. ================================================================ @@ -90,6 +100,7 @@ Insert/Extract Not sure of status of this -- reverify. Things are working nicely now though. * Make the Tensor types and Complex etc... play more nicely. + - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar > > QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I want to introduce a syntax that does not require this. @@ -112,6 +123,8 @@ Not sure of status of this -- reverify. Things are working nicely now though. RECENT --------------- + - Support different fermion representations? -- DONE + - contained entirely within the integrator presently - Clean up HMC -- DONE - LorentzScalar gets Gauge link type (cleaner). -- DONE - Simplified the integrators a bit. -- DONE @@ -123,6 +136,26 @@ RECENT - Parallel io improvements -- DONE - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE + +DONE: +- MultiArray -- MultiRHS done +- ConjugateGradientMultiShift -- DONE +- MCR -- DONE +- Remez -- Mike or Boost? -- DONE +- Proto (ET) -- DONE +- uBlas -- DONE ; Eigen +- Potentially Useful Boost libraries -- DONE ; Eigen +- Aligned allocator; memory pool -- DONE +- Multiprecision -- DONE +- Serialization -- DONE +- Regex -- Not needed +- Tokenize -- Why? + +- Random number state save restore -- DONE +- Rectangle gauge actions. -- DONE + Iwasaki, + Symanzik, + ... etc... Done: Cayley, Partial , ContFrac force terms. DONE @@ -207,6 +240,7 @@ Done FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane) ====================================================================================================== +* Link smearing/boundary conds; Policy class based implementation ; framework more in place -- DONE * Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE user pass these? Is this a QCD specific? 
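Context for the "Use Eigen instead" item above: the DenseMatrix/Francis/Householder headers relocated below, and the bisec.c/get_eig.c files deleted later in this patch, exist to diagonalise the small real tridiagonal matrix the Lanczos iteration assembles from its alpha/beta coefficients. A minimal sketch of the intended Eigen replacement (Eigen::MatrixXcd is already in use in BlockConjugateGradient.h; TridiagEigenvalues is a hypothetical helper name, not code from this patch):

    #include <Eigen/Dense>
    #include <vector>

    // Eigenvalues of the real symmetric tridiagonal Lanczos matrix with
    // diagonal alpha[0..N-1] and sub-diagonal beta[0..N-2].
    std::vector<double> TridiagEigenvalues(const std::vector<double> &alpha,
                                           const std::vector<double> &beta)
    {
      const int N = (int)alpha.size();
      Eigen::MatrixXd T = Eigen::MatrixXd::Zero(N, N);
      for (int i = 0; i < N; i++) {
        T(i, i) = alpha[i];
        if (i > 0) T(i, i - 1) = T(i - 1, i) = beta[i - 1];
      }
      // Dense symmetric eigensolve; O(N^3), but N ~ Nm stays small here.
      Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> es(T);
      std::vector<double> eval(N);
      for (int i = 0; i < N; i++) eval[i] = es.eigenvalues()(i); // ascending order
      return eval;
    }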
diff --git a/lib/algorithms/Algorithms.h b/lib/algorithms/Algorithms.h index 1b82f0ce..5123c7a1 100644 --- a/lib/algorithms/Algorithms.h +++ b/lib/algorithms/Algorithms.h @@ -46,7 +46,7 @@ Author: Peter Boyle #include // Lanczos support -#include +//#include #include #include #include diff --git a/lib/algorithms/iterative/DenseMatrix.h b/lib/algorithms/densematrix/DenseMatrix.h similarity index 100% rename from lib/algorithms/iterative/DenseMatrix.h rename to lib/algorithms/densematrix/DenseMatrix.h diff --git a/lib/algorithms/iterative/Francis.h b/lib/algorithms/densematrix/Francis.h similarity index 100% rename from lib/algorithms/iterative/Francis.h rename to lib/algorithms/densematrix/Francis.h diff --git a/lib/algorithms/iterative/Householder.h b/lib/algorithms/densematrix/Householder.h similarity index 100% rename from lib/algorithms/iterative/Householder.h rename to lib/algorithms/densematrix/Householder.h diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 233eb8f5..3aa54360 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -30,6 +30,7 @@ Author: paboyle #define GRID_IRL_H #include //memset + #ifdef USE_LAPACK void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, double *vl, double *vu, int *il, int *iu, double *abstol, @@ -37,8 +38,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, double *work, int *lwork, int *iwork, int *liwork, int *info); #endif -#include "DenseMatrix.h" -#include "EigenSort.h" + +#include +#include namespace Grid { @@ -1088,8 +1090,6 @@ static void Lock(DenseMatrix &H, // Hess mtx int dfg, bool herm) { - - //ForceTridiagonal(H); int M = H.dim; @@ -1121,7 +1121,6 @@ static void Lock(DenseMatrix &H, // Hess mtx AH = Hermitian(QQ)*AH; AH = AH*QQ; - for(int i=con;i - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef MATRIX_H -#define MATRIX_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/** Sign function **/ -template T sign(T p){return ( p/abs(p) );} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////// Hijack STL containers for our wicked means ///////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////////////// -template using Vector = Vector; -template using Matrix = Vector >; - -template void Resize(Vector & vec, int N) { vec.resize(N); } - -template void Resize(Matrix & mat, int N, int M) { - mat.resize(N); - for(int i=0;i void Size(Vector & vec, int &N) -{ - N= vec.size(); -} -template void Size(Matrix & mat, int &N,int &M) -{ - N= mat.size(); - M= mat[0].size(); -} -template void SizeSquare(Matrix & mat, int &N) -{ - int M; Size(mat,N,M); - assert(N==M); -} -template void SizeSame(Matrix & mat1,Matrix &mat2, int &N1,int &M1) -{ - int N2,M2; - Size(mat1,N1,M1); - Size(mat2,N2,M2); - assert(N1==N2); - assert(M1==M2); -} - -//***************************************** -//* (Complex) Vector operations * -//***************************************** - -/**Conj of a Vector **/ -template Vector conj(Vector p){ - Vector q(p.size()); - for(int i=0;i T norm(Vector p){ - T sum = 0; - for(int i=0;i T norm2(Vector p){ - T sum = 0; - for(int i=0;i T trace(Vector p){ - T sum = 0; - for(int i=0;i void Fill(Vector &p, T c){ - for(int i=0;i void normalize(Vector &p){ - T m = norm(p); - if( abs(m) > 0.0) for(int i=0;i Vector times(Vector p, U s){ - for(int i=0;i Vector times(U s, Vector p){ - for(int i=0;i T inner(Vector a, Vector b){ - T m = 0.; - for(int i=0;i Vector add(Vector a, Vector b){ - Vector m(a.size()); - for(int i=0;i Vector sub(Vector a, Vector b){ - Vector m(a.size()); - for(int i=0;i void Fill(Matrix & mat, T&val) { - int N,M; - Size(mat,N,M); - for(int i=0;i Transpose(Matrix & mat){ - int N,M; - Size(mat,N,M); - Matrix C; Resize(C,M,N); - for(int i=0;i void Unity(Matrix &mat){ - int N; SizeSquare(mat,N); - for(int i=0;i -void PlusUnit(Matrix & A,T c){ - int dim; SizeSquare(A,dim); - for(int i=0;i HermitianConj(Matrix &mat){ - - int dim; SizeSquare(mat,dim); - - Matrix C; Resize(C,dim,dim); - - for(int i=0;i diag(Matrix &A) -{ - int dim; SizeSquare(A,dim); - Vector d; Resize(d,dim); - - for(int i=0;i operator *(Vector &B,Matrix &A) -{ - int K,M,N; - Size(B,K); - Size(A,M,N); - assert(K==M); - - Vector C; Resize(C,N); - - for(int j=0;j inv_diag(Matrix & A){ - int dim; SizeSquare(A,dim); - Vector d; Resize(d,dim); - for(int i=0;i operator + (Matrix &A,Matrix &B) -{ - int N,M ; SizeSame(A,B,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator- (Matrix & A,Matrix &B){ - int N,M ; SizeSame(A,B,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator* (Matrix & A,T c){ - int N,M; Size(A,N,M); - Matrix C; Resize(C,N,M); - for(int i=0;i operator* (Matrix &A,Matrix &B){ - int K,L,N,M; - Size(A,K,L); - Size(B,N,M); assert(L==N); - Matrix C; Resize(C,K,M); - - for(int i=0;i operator* (Matrix &A,Vector &B){ - int M,N,K; - Size(A,N,M); - Size(B,K); assert(K==M); - Vector C; Resize(C,N); - for(int i=0;i T LargestDiag(Matrix &A) -{ - int dim ; SizeSquare(A,dim); - - T ld = abs(A[0][0]); - for(int i=1;i 
abs(ld) ){ld = cf;} - } - return ld; -} - -/** Look for entries on the leading subdiagonal that are smaller than 'small' **/ -template int Chop_subdiag(Matrix &A,T norm, int offset, U small) -{ - int dim; SizeSquare(A,dim); - for(int l = dim - 1 - offset; l >= 1; l--) { - if((U)abs(A[l][l - 1]) < (U)small) { - A[l][l-1]=(U)0.0; - return l; - } - } - return 0; -} - -/** Look for entries on the leading subdiagonal that are smaller than 'small' **/ -template int Chop_symm_subdiag(Matrix & A,T norm, int offset, U small) -{ - int dim; SizeSquare(A,dim); - for(int l = dim - 1 - offset; l >= 1; l--) { - if((U)abs(A[l][l - 1]) < (U)small) { - A[l][l - 1] = (U)0.0; - A[l - 1][l] = (U)0.0; - return l; - } - } - return 0; -} -/**Assign a submatrix to a larger one**/ -template -void AssignSubMtx(Matrix & A,int row_st, int row_end, int col_st, int col_end, Matrix &S) -{ - for(int i = row_st; i -Matrix GetSubMtx(Matrix &A,int row_st, int row_end, int col_st, int col_end) -{ - Matrix H; Resize(row_end - row_st,col_end-col_st); - - for(int i = row_st; i -void AssignSubMtx(Matrix & A,int row_st, int row_end, int col_st, int col_end, Matrix &S) -{ - for(int i = row_st; i T proj(Matrix A, Vector B){ - int dim; SizeSquare(A,dim); - int dimB; Size(B,dimB); - assert(dimB==dim); - T C = 0; - for(int i=0;i q Q -template void times(Vector &q, Matrix &Q) -{ - int M; SizeSquare(Q,M); - int N; Size(q,N); - assert(M==N); - - times(q,Q,N); -} - -/// q -> q Q -template void times(multi1d &q, Matrix &Q, int N) -{ - GridBase *grid = q[0]._grid; - int M; SizeSquare(Q,M); - int K; Size(q,K); - assert(N S(N,grid ); - for(int j=0;j - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_MATRIX_UTILS_H -#define GRID_MATRIX_UTILS_H - -namespace Grid { - - namespace MatrixUtils { - - template inline void Size(Matrix& A,int &N,int &M){ - N=A.size(); assert(N>0); - M=A[0].size(); - for(int i=0;i inline void SizeSquare(Matrix& A,int &N) - { - int M; - Size(A,N,M); - assert(N==M); - } - - template inline void Fill(Matrix& A,T & val) - { - int N,M; - Size(A,N,M); - for(int i=0;i inline void Diagonal(Matrix& A,T & val) - { - int N; - SizeSquare(A,N); - for(int i=0;i inline void Identity(Matrix& A) - { - Fill(A,0.0); - Diagonal(A,1.0); - } - - }; -} -#endif diff --git a/lib/algorithms/iterative/TODO b/lib/algorithms/iterative/TODO deleted file mode 100644 index ca3bca3b..00000000 --- a/lib/algorithms/iterative/TODO +++ /dev/null @@ -1,15 +0,0 @@ -- ConjugateGradientMultiShift -- MCR - -- Potentially Useful Boost libraries - -- MultiArray -- Aligned allocator; memory pool -- Remez -- Mike or Boost? 
-- Multiprecision -- quaternians -- Tokenize -- Serialization -- Regex -- Proto (ET) -- uBlas diff --git a/lib/algorithms/iterative/bisec.c b/lib/algorithms/iterative/bisec.c deleted file mode 100644 index a5117aee..00000000 --- a/lib/algorithms/iterative/bisec.c +++ /dev/null @@ -1,122 +0,0 @@ -#include -#include -#include - -struct Bisection { - -static void get_eig2(int row_num,std::vector &ALPHA,std::vector &BETA, std::vector & eig) -{ - int i,j; - std::vector evec1(row_num+3); - std::vector evec2(row_num+3); - RealD eps2; - ALPHA[1]=0.; - BETHA[1]=0.; - for(i=0;imag(evec2[i+1])) { - swap(evec2+i,evec2+i+1); - swapped=1; - } - } - end--; - for(i=end-1;i>=begin;i--){ - if(mag(evec2[i])>mag(evec2[i+1])) { - swap(evec2+i,evec2+i+1); - swapped=1; - } - } - begin++; - } - - for(i=0;i &c, - std::vector &b, - int n, - int m1, - int m2, - RealD eps1, - RealD relfeh, - std::vector &x, - RealD &eps2) -{ - std::vector wu(n+2); - - RealD h,q,x1,xu,x0,xmin,xmax; - int i,a,k; - - b[1]=0.0; - xmin=c[n]-fabs(b[n]); - xmax=c[n]+fabs(b[n]); - for(i=1;ixmax) xmax= c[i]+h; - if(c[i]-h0.0 ? xmax : -xmin); - if(eps1<=0.0) eps1=eps2; - eps2=0.5*eps1+7.0*(eps2); - x0=xmax; - for(i=m1;i<=m2;i++){ - x[i]=xmax; - wu[i]=xmin; - } - - for(k=m2;k>=m1;k--){ - xu=xmin; - i=k; - do{ - if(xu=m1); - if(x0>x[k]) x0=x[k]; - while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){ - x1=(xu+x0)/2; - - a=0; - q=1.0; - for(i=1;i<=n;i++){ - q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh); - if(q<0) a++; - } - // printf("x1=%e a=%d\n",x1,a); - if(ax1) x[a]=x1; - } - }else x0=x1; - } - x[k]=(x0+xu)/2; - } -} -} diff --git a/lib/algorithms/iterative/get_eig.c b/lib/algorithms/iterative/get_eig.c deleted file mode 100644 index d3f5a12f..00000000 --- a/lib/algorithms/iterative/get_eig.c +++ /dev/null @@ -1 +0,0 @@ - diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 990a4e0f..d0685480 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -311,8 +311,8 @@ void Grid_init(int *argc,char ***argv) std::cout< Cheby(alpha,beta,mu,order); @@ -131,10 +131,9 @@ int main (int argc, char ** argv) const int Nit= 10000; int Nconv; - RealD eresid = 1.0e-8; + RealD eresid = 1.0e-6; ImplicitlyRestartedLanczos IRL(HermOp,X,Nk,Nm,eresid,Nit); - ImplicitlyRestartedLanczos ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit); LatticeComplex src(grid); gaussian(RNG,src); @@ -145,9 +144,9 @@ int main (int argc, char ** argv) } { - // std::vector eval(Nm); - // std::vector evec(Nm,grid); - // ChebyIRL.calc(eval,evec,src, Nconv); + std::vector eval(Nm); + std::vector evec(Nm,grid); + ChebyIRL.calc(eval,evec,src, Nconv); } Grid_finalize(); From 441a52ee5d9917ef0c7d2de179adcc9dab71804d Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 15 Apr 2017 10:57:21 +0100 Subject: [PATCH 31/36] First cut at higher precision reduction --- lib/lattice/Lattice_reduction.h | 197 ++++++++++++++++---------------- lib/simd/Grid_vector_types.h | 1 - lib/tensors/Tensor_class.h | 3 + lib/tensors/Tensor_inner.h | 142 ++++++++++++++++------- lib/tensors/Tensor_traits.h | 10 ++ 5 files changed, 210 insertions(+), 143 deletions(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 45a88a64..bb7808b8 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -38,110 +38,108 @@ namespace Grid { //////////////////////////////////////////////////////////////////////////////////////////////////// // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// - 
template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); - return std::real(nrm); +template inline RealD norm2(const Lattice &arg){ + ComplexD nrm = innerProduct(arg,arg); + return std::real(nrm); +} + +template +inline ComplexD innerProduct(const Lattice &left,const Lattice &right) +{ + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + scalar_type nrm; + + GridBase *grid = left._grid; + + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; } - - template - inline ComplexD innerProduct(const Lattice &left,const Lattice &right) - { - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - scalar_type nrm; - - GridBase *grid = left._grid; - - std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); - - decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation - for(int ss=myoff;ssSumArraySize();i++){ - vvnrm = vvnrm+sumarray[i]; - } - nrm = Reduce(vvnrm);// sum across simd - right._grid->GlobalSum(nrm); - return nrm; + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); + + decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + for(int ss=myoff;ssSumArraySize();i++){ + vvnrm = vvnrm+sumarray[i]; + } + nrm = Reduce(vvnrm);// sum across simd + right._grid->GlobalSum(nrm); + return nrm; +} + +template +inline auto sum(const LatticeUnaryExpression & expr) + ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object +{ + return sum(closure(expr)); +} - template - inline auto sum(const LatticeUnaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object - { - return sum(closure(expr)); - } - - template - inline auto sum(const LatticeBinaryExpression & expr) +template +inline auto sum(const LatticeBinaryExpression & expr) ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object - { - return sum(closure(expr)); +{ + return sum(closure(expr)); +} + + +template +inline auto sum(const LatticeTrinaryExpression & expr) + ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), + eval(0,std::get<1>(expr.second)), + eval(0,std::get<2>(expr.second)) + ))::scalar_object +{ + return sum(closure(expr)); +} + +template +inline typename vobj::scalar_object sum(const Lattice &arg) +{ + GridBase *grid=arg._grid; + int Nsimd = grid->Nsimd(); + + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; + } + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(grid->oSites(),thr,mywork,myoff); + + vobj vvsum=zero; + for(int ss=myoff;ss - inline auto sum(const LatticeTrinaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), - eval(0,std::get<1>(expr.second)), - eval(0,std::get<2>(expr.second)) - ))::scalar_object - { - return sum(closure(expr)); - } - - template - inline typename vobj::scalar_object sum(const Lattice &arg){ - - GridBase *grid=arg._grid; - int Nsimd = grid->Nsimd(); - - 
std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } - - parallel_for(int thr=0;thrSumArraySize();thr++){ - int nwork, mywork, myoff; - GridThread::GetWork(grid->oSites(),thr,mywork,myoff); - - vobj vvsum=zero; - for(int ss=myoff;ssSumArraySize();i++){ - vsum = vsum+sumarray[i]; - } - - typedef typename vobj::scalar_object sobj; - sobj ssum=zero; - - std::vector buf(Nsimd); - extract(vsum,buf); - - for(int i=0;iGlobalSum(ssum); - - return ssum; - } - - + sumarray[thr]=vvsum; + } + + vobj vsum=zero; // sum across threads + for(int i=0;iSumArraySize();i++){ + vsum = vsum+sumarray[i]; + } + + typedef typename vobj::scalar_object sobj; + sobj ssum=zero; + + std::vector buf(Nsimd); + extract(vsum,buf); + + for(int i=0;iGlobalSum(ssum); + + return ssum; +} template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { @@ -214,7 +212,6 @@ template inline void sliceSum(const Lattice &Data,std::vector< result[t]=gsum; } - } diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index ccedf99c..248a625c 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -694,7 +694,6 @@ inline Grid_simd innerProduct(const Grid_simd &l, const Grid_simd &r) { return conjugate(l) * r; } - template inline Grid_simd outerProduct(const Grid_simd &l, const Grid_simd &r) { diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h index e0b69eb0..9b806d75 100644 --- a/lib/tensors/Tensor_class.h +++ b/lib/tensors/Tensor_class.h @@ -56,6 +56,7 @@ class iScalar { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; @@ -193,6 +194,7 @@ class iVector { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; typedef iScalar tensor_reduced; @@ -305,6 +307,7 @@ class iMatrix { typedef vtype element; typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; diff --git a/lib/tensors/Tensor_inner.h b/lib/tensors/Tensor_inner.h index 22284be4..46185652 100644 --- a/lib/tensors/Tensor_inner.h +++ b/lib/tensors/Tensor_inner.h @@ -29,51 +29,109 @@ Author: Peter Boyle #ifndef GRID_MATH_INNER_H #define GRID_MATH_INNER_H namespace Grid { - /////////////////////////////////////////////////////////////////////////////////////// - // innerProduct Scalar x Scalar -> Scalar - // innerProduct Vector x Vector -> Scalar - // innerProduct Matrix x Matrix -> Scalar - /////////////////////////////////////////////////////////////////////////////////////// - template inline RealD norm2(const sobj &arg){ - typedef typename sobj::scalar_type scalar; - decltype(innerProduct(arg,arg)) nrm; - nrm = innerProduct(arg,arg); - RealD ret = real(nrm); - return ret; - } + 
/////////////////////////////////////////////////////////////////////////////////////// + // innerProduct Scalar x Scalar -> Scalar + // innerProduct Vector x Vector -> Scalar + // innerProduct Matrix x Matrix -> Scalar + /////////////////////////////////////////////////////////////////////////////////////// + template inline RealD norm2(const sobj &arg){ + auto nrm = innerProductD(arg,arg); + RealD ret = real(nrm); + return ret; + } + ////////////////////////////////////// + // If single promote to double and sum 2x + ////////////////////////////////////// - template inline - auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - for(int c1=0;c1 inline + auto innerProductD (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline - auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; - iScalar ret; - iScalar tmp; - ret=zero; - for(int c1=0;c1 inline - auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - ret._internal = innerProduct(lhs._internal,rhs._internal); - return ret; + return ret; + } + template inline + auto innerProductD (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProductD(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProductD(lhs._internal,rhs._internal); + return ret; + } + ////////////////////// + // Keep same precison + ////////////////////// + template inline + auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + iScalar tmp; + ret=zero; + for(int c1=0;c1 inline + auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar + { + typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProduct(lhs._internal,rhs._internal); + return ret; + } } #endif diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index 777b398d..4dcfd9b1 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -53,6 +53,7 @@ namespace Grid { public: typedef typename T::scalar_type scalar_type; typedef typename T::vector_type vector_type; + typedef typename T::vector_typeD vector_typeD; typedef typename T::tensor_reduced tensor_reduced; typedef typename T::scalar_object scalar_object; typedef typename T::Complexified Complexified; @@ -67,6 +68,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef RealF vector_type; + typedef RealD vector_typeD; typedef RealF tensor_reduced ; typedef RealF scalar_object; typedef ComplexF Complexified; @@ -77,6 +79,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef RealD vector_type; + typedef RealD 
vector_typeD; typedef RealD tensor_reduced; typedef RealD scalar_object; typedef ComplexD Complexified; @@ -87,6 +90,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef ComplexF vector_type; + typedef ComplexD vector_typeD; typedef ComplexF tensor_reduced; typedef ComplexF scalar_object; typedef ComplexF Complexified; @@ -97,6 +101,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef ComplexD vector_type; + typedef ComplexD vector_typeD; typedef ComplexD tensor_reduced; typedef ComplexD scalar_object; typedef ComplexD Complexified; @@ -118,6 +123,7 @@ namespace Grid { public: typedef RealF scalar_type; typedef vRealF vector_type; + typedef vRealD vector_typeD; typedef vRealF tensor_reduced; typedef RealF scalar_object; typedef vComplexF Complexified; @@ -128,6 +134,7 @@ namespace Grid { public: typedef RealD scalar_type; typedef vRealD vector_type; + typedef vRealD vector_typeD; typedef vRealD tensor_reduced; typedef RealD scalar_object; typedef vComplexD Complexified; @@ -138,6 +145,7 @@ namespace Grid { public: typedef ComplexF scalar_type; typedef vComplexF vector_type; + typedef vComplexD vector_typeD; typedef vComplexF tensor_reduced; typedef ComplexF scalar_object; typedef vComplexF Complexified; @@ -148,6 +156,7 @@ namespace Grid { public: typedef ComplexD scalar_type; typedef vComplexD vector_type; + typedef vComplexD vector_typeD; typedef vComplexD tensor_reduced; typedef ComplexD scalar_object; typedef vComplexD Complexified; @@ -158,6 +167,7 @@ namespace Grid { public: typedef Integer scalar_type; typedef vInteger vector_type; + typedef vInteger vector_typeD; typedef vInteger tensor_reduced; typedef Integer scalar_object; typedef void Complexified; From bf516c3b8127667129daa993a61d1f2755039407 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 15 Apr 2017 12:27:28 +0100 Subject: [PATCH 32/36] higher precision reduction variables in norm and inner product --- lib/lattice/Lattice_reduction.h | 3 ++- tests/solver/Test_dwf_cg_prec.cc | 2 +- tests/solver/Test_dwf_cg_unprec.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index bb7808b8..e12bf0dd 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -42,7 +42,7 @@ template inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return std::real(nrm); } - +// Double inner product template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { @@ -102,6 +102,7 @@ inline auto sum(const LatticeTrinaryExpression & expr) return sum(closure(expr)); } +// FIXME precision promoted summation template inline typename vobj::scalar_object sum(const Lattice &arg) { diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index 30436e36..0ede352e 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -89,7 +89,7 @@ int main(int argc, char** argv) { GridStopWatch CGTimer; SchurDiagMooeeOperator HermOpEO(Ddwf); - ConjugateGradient CG(1.0e-8, 10000, 0);// switch off the assert + ConjugateGradient CG(1.0e-5, 10000, 0);// switch off the assert CGTimer.Start(); CG(HermOpEO, src_o, result_o); diff --git a/tests/solver/Test_dwf_cg_unprec.cc b/tests/solver/Test_dwf_cg_unprec.cc index 131f6ce1..38f632ef 100644 --- a/tests/solver/Test_dwf_cg_unprec.cc +++ b/tests/solver/Test_dwf_cg_unprec.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); 
MdagMLinearOperator HermOp(Ddwf); - ConjugateGradient CG(1.0e-8,10000); + ConjugateGradient CG(1.0e-6,10000); CG(HermOp,src,result); Grid_finalize(); From 7ede6961269516638aa21560f9511ec5140fbcb2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 16 Apr 2017 23:40:00 +0100 Subject: [PATCH 33/36] Non compile of tests fixed --- TODO | 17 +- .../iterative/BlockConjugateGradient.h | 201 ------------------ lib/lattice/Lattice_reduction.h | 159 ++++++++++++++ lib/simd/Grid_vector_types.h | 2 +- lib/tensors/Tensor_traits.h | 1 + 5 files changed, 170 insertions(+), 210 deletions(-) diff --git a/TODO b/TODO index 91034f20..27579ad3 100644 --- a/TODO +++ b/TODO @@ -3,19 +3,20 @@ TODO: Peter's work list: --- Merge high precision reduction into develop +-- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started +-- Merge high precision reduction into develop <-- done +-- Precision conversion and sort out localConvert <-- -- Physical propagator interface --- Precision conversion and sort out localConvert --- slice* linalg routines for multiRHS, BlockCG + +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination + -- slice* linalg routines for multiRHS, BlockCG <-- started + -- Profile CG, BlockCG, etc... Flop count/rate -- Binary I/O speed up & x-strips --- Half-precision comms --- multiRHS DWF; benchmark on Cori/BNL for comms elimination +-- Half-precision comms <-- started -- GaugeFix into central location --- Help Julia with NPR code --- Switch to measurements +-- FFTfix in sensible place -- Multigrid Wilson and DWF, compare to other Multigrid implementations --- Remove DenseVector, DenseMatrix; Use Eigen instead. -- quaternions -- Might not need diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 0f4a3a80..1db89512 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -30,210 +30,9 @@ directory #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H #define GRID_BLOCK_CONJUGATE_GRADIENT_H -#include namespace Grid { -GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) -{ - int NN = BlockSolverGrid->_ndimension; - int nsimd = BlockSolverGrid->Nsimd(); - - std::vector latt_phys(0); - std::vector simd_phys(0); - std::vector mpi_phys(0); - - for(int d=0;d_fdimensions[d]); - simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); - mpi_phys.push_back(BlockSolverGrid->_processors[d]); - } - } - return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); -} - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... 
into lattice sector along with sliceSum - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // FIXME: Implementation is slow - // If we based this on Cshift it would work for spread out - // but it would be even slower - // - // Repeated extract slice is inefficient - // - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - for(int i=0;i -static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) -{ - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i -static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - // FIXME: Implementation is slow - // Not sure of best solution.. 
think about it - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - int Nblock = FullGrid->GlobalDimensions()[Orthog]; - - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - for(int i=0;i -static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss -static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = rhs._grid->GlobalDimensions()[Orthog]; - std::vector ip(Nblock); - sn.resize(Nblock); - - sliceInnerProductVector(ip,rhs,rhs,Orthog); - for(int ss=0;ss -static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - std::cout << " sliceInnerProductMatrix Dim "< IP(lhs._grid); - std::vector sip(Nblock); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - Lattice tmp = rhs; - - for(int s1=0;s1 #ifndef GRID_LATTICE_REDUCTION_H #define GRID_LATTICE_REDUCTION_H +#include + namespace Grid { #ifdef GRID_WARN_SUBOPTIMAL #warning "Optimisation alert all these reduction loops are NOT threaded " @@ -215,6 +217,163 @@ template inline void sliceSum(const Lattice &Data,std::vector< } } +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) + { + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); + } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... 
into lattice sector along with sliceSum + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template + static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) + { + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // FIXME: Implementation is slow + // If we based this on Cshift it would work for spread out + // but it would be even slower + // + // Repeated extract slice is inefficient + // + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + for(int i=0;i + static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) + { + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i + static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i + static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss + static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = rhs._grid->GlobalDimensions()[Orthog]; + std::vector ip(Nblock); + sn.resize(Nblock); + + sliceInnerProductVector(ip,rhs,rhs,Orthog); + for(int ss=0;ss Date: Mon, 17 Apr 2017 10:50:19 +0100 Subject: [PATCH 34/36] MultiRHS working, starting to optimise. 
Block doesn't and I thought it already was; puzzled. --- .../iterative/BlockConjugateGradient.h | 2 + lib/lattice/Lattice_reduction.h | 209 ++++++++++++++---- 2 files changed, 170 insertions(+), 41 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 1db89512..42a9af56 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -256,8 +256,10 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) Linop.HermOp(P, AP); // Alpha + // sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog); sliceInnerProductVector(v_pAp,P,AP,Orthog); for(int b=0;b &left,const Lattice &righ GridBase *grid = left._grid; std::vector > sumarray(grid->SumArraySize()); - for(int i=0;iSumArraySize();i++){ - sumarray[i]=zero; - } parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; @@ -152,7 +149,6 @@ template inline void sliceSum(const Lattice &Data,std::vector< // FIXME // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -164,8 +160,8 @@ template inline void sliceSum(const Lattice &Data,std::vector< int rd=grid->_rdimensions[orthogdim]; std::vector > lvSum(rd); // will locally sum vectors first - std::vector lsSum(ld,zero); // sum across these down to scalars - std::vector extracted(Nsimd); // splitting the SIMD + std::vector lsSum(ld,zero); // sum across these down to scalars + std::vector extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file for(int r=0;r inline void sliceSum(const Lattice &Data,std::vector< std::vector coor(Nd); // sum over reduced dimension planes, breaking out orthog dir - for(int ss=0;ssoSites();ss++){ Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); int r = coor[orthogdim]; lvSum[r]=lvSum[r]+Data._odata[ss]; - } + } // Sum across simd lanes in the plane, breaking out orthog dir. 
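  // (extract() below splits each SIMD-summed lvSum[rt] back into its Nsimd
  //  scalar lanes; iCoorFromIindex maps lane idx to the local slice it owns,
  //  ldx = rt + icoor[orthogdim]*rd, which accumulates into lsSum[ldx].)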
std::vector icoor(Nd); @@ -217,6 +212,171 @@ template inline void sliceSum(const Lattice &Data,std::vector< } } +template + static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss +static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + GridBase *grid = lhs._grid; + assert(grid!=NULL); + conformable(grid,rhs._grid); + + // FIXME + // std::cout<SumArraySize()<<" threads "<_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + std::vector > lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars + std::vector > extracted(Nsimd); // splitting the SIMD + + result.resize(fd); // And then global sum to return the same vector to every node for IO to file + for(int r=0;r coor(Nd); + vector_type vv; + PARALLEL_FOR_LOOP_INTERN + for(int ss=0;ssoSites();ss++){ + Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); + int r = coor[orthogdim]; + vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); + PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add + lvSum[r]=lvSum[r]+vv; + } + } + } + + // Sum across simd lanes in the plane, breaking out orthog dir. + std::vector icoor(Nd); + for(int rt=0;rt temp; temp._internal = lvSum[rt]; + extract(temp,extracted); + + for(int idx=0;idxiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal; + + } + } + + // sum over nodes. 
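+  // Only the node whose processor coordinate owns global slice t
+  // (pt == _processor_coor[orthogdim]) contributes its local lsSum[lt];
+  // every other node adds zero, so GlobalSum leaves the full vector
+  // result[t] identically on all nodes.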
+ scalar_type gsum; + for(int t=0;t_processor_coor[orthogdim] ) { + gsum=lsSum[lt]; + } else { + gsum=scalar_type(0.0); + } + + grid->GlobalSum(gsum); + + result[t]=gsum; + } +} +#if 0 +template +static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Look at sliceSum implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + GridBase * grid = lhs._grid; + + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + int Nblock = grid->GlobalDimensions()[Orthog]; + int Nrblock = grid->_rdimensions[Orthog]; + int Nthr = grid->SumArraySize(); + + std::vector > sumarray(Nrblock*Nthr); + + parallel_for(int thr=0;thrSumArraySize();thr++){ + + int nwork, mywork, myoff; + + for(int rb=0;rboSites()/Nrblock),thr,mywork,myoff); + int off = rb * grid->_slice_ + vector_type vnrm=zero; // private to thread; sub summation + for(int ss=myoff;ss sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss_ndimension; @@ -322,41 +482,8 @@ template mat(i,j) = innerProduct(Lslice,Rslice); } } -#undef FORCE_DIAG -#ifdef FORCE_DIAG - for(int i=0;i - static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { From 8e161152e457954ba2493b9d807516ae0f2eb030 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 18 Apr 2017 10:51:55 +0100 Subject: [PATCH 35/36] MultiRHS solver improvements with slice operations moved into lattice and sped up. Block solver requires a lot of performance work. 
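For orientation, the dense algebra at the heart of the block solver: each
iteration reduces the Nblock right-hand sides to a few Nblock x Nblock Eigen
operations wrapped around the slice routines this patch moves into
Lattice_reduction.h. In outline, one iteration reads (a sketch using the names
from BlockConjugateGradient.h; m_rr_inv, the inverse of the previous residual
Gram matrix, is carried over from the prior step):

    Linop.HermOp(P, AP);                                  // AP_j = A p_j
    sliceInnerProductMatrix(m_pAp, P, AP, Orthog);        // (pAp)_{ij} = <p_i, A p_j>
    m_pAp_inv = m_pAp.inverse();                          // small dense inverse (Eigen)
    m_alpha   = m_pAp_inv * m_rr;                         // step lengths, one column per RHS
    sliceMaddMatrix(Psi, m_alpha, P,  Psi, Orthog);       // Psi += P  . alpha
    sliceMaddMatrix(R,   m_alpha, AP, R,   Orthog, -1.0); // R   -= AP . alpha
    sliceInnerProductMatrix(m_rr, R, R, Orthog);          // new residual Gram matrix
    m_beta = m_rr_inv * m_rr;                             // block analogue of CG beta
    sliceMaddMatrix(P, m_beta, P, R, Orthog);             // P = R + P . beta

The cost profile is dominated by sliceInnerProductMatrix and sliceMaddMatrix,
which is why this patch instruments them with the sliceInnerTimer/sliceMaddTimer
stopwatches.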
--- .../iterative/BlockConjugateGradient.h | 85 +++- lib/algorithms/iterative/ConjugateGradient.h | 39 +- lib/lattice/Lattice_reduction.h | 457 +++++++++--------- lib/log/Log.h | 4 +- lib/simd/Grid_vector_types.h | 4 - lib/tensors/Tensor_class.h | 28 +- lib/tensors/Tensor_traits.h | 3 +- .../solver/Test_staggered_block_cg_unprec.cc | 30 +- tests/solver/Test_staggered_cg_unprec.cc | 1 - 9 files changed, 366 insertions(+), 285 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 42a9af56..d90194ae 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -60,8 +60,8 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { int Orthog = 0; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; - std::cout< &Linop, const Field &Src, Field &Psi) Field AP(Src); Field R(Src); - GridStopWatch LinalgTimer; - GridStopWatch MatrixTimer; - GridStopWatch SolverTimer; - Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock); @@ -116,33 +112,49 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) P = R; sliceInnerProductMatrix(m_rr,R,R,Orthog); + GridStopWatch sliceInnerTimer; + GridStopWatch sliceMaddTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + SolverTimer.Start(); + int k; for (k = 1; k <= MaxIterations; k++) { RealD rrsum=0; for(int b=0;b &Linop, const Field &Src, Field &Psi) if ( max_resid < Tolerance*Tolerance ) { - std::cout << GridLogMessage<<" Block solver has converged in " - < &Linop, const Field &Src, Field &Psi) { int Orthog = 0; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; - std::cout< &Linop, const Field &Src, Field &Psi) P = R; sliceNorm(v_rr,R,Orthog); + GridStopWatch sliceInnerTimer; + GridStopWatch sliceMaddTimer; + GridStopWatch sliceNormTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + + SolverTimer.Start(); int k; for (k = 1; k <= MaxIterations; k++) { RealD rrsum=0; for(int b=0;b &Linop, const Field &Src, Field &Psi) } if ( max_resid < Tolerance*Tolerance ) { - std::cout << GridLogMessage<<" MultiRHS solver has converged in " - < { cp = a; ssq = norm2(src); - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: guess " << guess << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: src " << ssq << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mp " << d << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mmp " << b << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: cp,r " << cp << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: p " << a << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << 
"ConjugateGradient: cp,r " << cp << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl; RealD rsq = Tolerance * Tolerance * ssq; @@ -144,19 +138,20 @@ class ConjugateGradient : public OperatorFunction { RealD resnorm = sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage - << "ConjugateGradient: Converged on iteration " << k << std::endl; - std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq) - << " true residual " << true_residual << " target " - << Tolerance << std::endl; - std::cout << GridLogMessage << "Time elapsed: Iterations " - << SolverTimer.Elapsed() << " Matrix " - << MatrixTimer.Elapsed() << " Linalg " - << LinalgTimer.Elapsed(); - std::cout << std::endl; + std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; + std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)< inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return std::real(nrm); } + // Double inner product template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) @@ -101,7 +102,6 @@ inline auto sum(const LatticeTrinaryExpression & expr) return sum(closure(expr)); } -// FIXME precision promoted summation template inline typename vobj::scalar_object sum(const Lattice &arg) { @@ -141,14 +141,22 @@ inline typename vobj::scalar_object sum(const Lattice &arg) return ssum; } + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... +////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { + /////////////////////////////////////////////////////// + // FIXME precision promoted summation + // may be important for correlation functions + // But easily avoided by using double precision fields + /////////////////////////////////////////////////////// typedef typename vobj::scalar_object sobj; GridBase *grid = Data._grid; assert(grid!=NULL); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -163,18 +171,27 @@ template inline void sliceSum(const Lattice &Data,std::vector< std::vector lsSum(ld,zero); // sum across these down to scalars std::vector extracted(Nsimd); // splitting the SIMD - result.resize(fd); // And then global sum to return the same vector to every node for IO to file + result.resize(fd); // And then global sum to return the same vector to every node for(int r=0;r coor(Nd); + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir - for(int ss=0;ssoSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - lvSum[r]=lvSum[r]+Data._odata[ss]; + // Parallel over orthog direction + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n inline void sliceSum(const Lattice &Data,std::vector< } } -template - static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename 
vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) { @@ -247,8 +238,6 @@ static void sliceInnerProductVector( std::vector & result, const Latti assert(grid!=NULL); conformable(grid,rhs._grid); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -268,16 +257,18 @@ static void sliceInnerProductVector( std::vector & result, const Latti lvSum[r]=zero; } - // sum over reduced dimension planes, breaking out orthog dir - PARALLEL_REGION { - std::vector coor(Nd); - vector_type vv; - PARALLEL_FOR_LOOP_INTERN - for(int ss=0;ssoSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); - PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n & result, const Latti std::vector icoor(Nd); for(int rt=0;rt temp; temp._internal = lvSum[rt]; + iScalar temp; + temp._internal = lvSum[rt]; extract(temp,extracted); for(int idx=0;idx & result, const Latti result[t]=gsum; } } -#if 0 template -static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - // FIXME: Implementation is slow - // Look at sliceSum implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - GridBase * grid = lhs._grid; - - - const int Nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - assert(orthogdim >= 0); - assert(orthogdim < Nd); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - int Nblock = grid->GlobalDimensions()[Orthog]; - int Nrblock = grid->_rdimensions[Orthog]; - int Nthr = grid->SumArraySize(); - - std::vector > sumarray(Nrblock*Nthr); - - parallel_for(int thr=0;thrSumArraySize();thr++){ - - int nwork, mywork, myoff; - - for(int rb=0;rboSites()/Nrblock),thr,mywork,myoff); - int off = rb * grid->_slice_ - vector_type vnrm=zero; // private to thread; sub summation - for(int ss=myoff;ss sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss_ndimension; - int nsimd = BlockSolverGrid->Nsimd(); - - std::vector latt_phys(0); - std::vector simd_phys(0); - std::vector mpi_phys(0); - - for(int d=0;d_fdimensions[d]); - simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); - mpi_phys.push_back(BlockSolverGrid->_processors[d]); - } - } - return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); - } - 
////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template - static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) - { - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // FIXME: Implementation is slow - // If we based this on Cshift it would work for spread out - // but it would be even slower - // - // Repeated extract slice is inefficient - // - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - for(int i=0;i - static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) - { - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i - static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Not sure of best solution.. 
think about it - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - int Nblock = FullGrid->GlobalDimensions()[Orthog]; - - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - for(int i=0;i - static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -499,9 +324,207 @@ template for(int ss=0;ss +static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int orthogdim,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced tensor_reduced; + + GridBase *grid = X._grid; + + int Nsimd =grid->Nsimd(); + int Nblock =grid->GlobalDimensions()[orthogdim]; + + int fd =grid->_fdimensions[orthogdim]; + int ld =grid->_ldimensions[orthogdim]; + int rd =grid->_rdimensions[orthogdim]; + + int e1 =grid->_slice_nblock[orthogdim]; + int e2 =grid->_slice_block [orthogdim]; + int stride =grid->_slice_stride[orthogdim]; + + std::vector icoor; + + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + vector_type av; + + for(int l=0;liCoorFromIindex(icoor,l); + int ldx =r+icoor[orthogdim]*rd; + scalar_type *as =(scalar_type *)&av; + as[l] = scalar_type(a[ldx])*scale; + } + + tensor_reduced at; at=av; + + parallel_for_nest2(int n=0;n +static void sliceMaddVectorSlow (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) +{ + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i +static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss_rdimensions[Orthog]. 
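+  // (Note on the fast sliceMaddVector above: one vector_type word av is
+  //  packed so SIMD lane l carries scale*a[ldx], where ldx = r +
+  //  icoor[orthogdim]*rd is the local slice that lane l represents (via
+  //  iCoorFromIindex); the plane loop then forms R = av*X + Y with no
+  //  per-site coordinate decode. In plain C++ the packing step is just:
+  //    for (int l = 0; l < Nsimd; l++) lanes[l] = scale * a[sliceOfLane(l)];
+  //  with sliceOfLane a stand-in for the icoor lookup.)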
+////////////////////////////////////////////////////////////////////////////////////////// + +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) +{ + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } + + +template +static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + + for(int i=0;i +static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Real by "< = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Complex by "< =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Real by "< =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Complex by "<::vector_type vector_type; typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; - typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; + typedef iScalar tensor_reduced; typedef iScalar scalar_object; - // substitutes a real or complex version with same tensor structure typedef iScalar::Complexified> Complexified; typedef iScalar::Realified> Realified; @@ -78,8 +77,12 @@ class iScalar { iScalar & operator= (const iScalar ©me) = default; iScalar & operator= (iScalar &©me) = default; */ - iScalar(scalar_type s) - : _internal(s){}; // recurse down and hit the constructor for vector_type + + // template + // iScalar(EnableIf, vector_type> s) : _internal(s){}; // recurse down and hit the constructor for vector_type + + iScalar(scalar_type s) : _internal(s){}; // recurse down and hit the constructor for vector_type + iScalar(const Zero &z) { *this = zero; }; iScalar &operator=(const Zero &hero) { @@ -135,33 +138,28 @@ class iScalar { strong_inline const vtype &operator()(void) const { return _internal; } // Type casts meta programmed, must be pure scalar to match TensorRemove - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexF() const { 
return (TensorRemove(_internal)); }; - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexD() const { return (TensorRemove(_internal)); }; // template = 0,IfNotSimd = // 0> operator RealD () const { return(real(TensorRemove(_internal))); } - template = 0, - IfNotSimd = 0> + template = 0,IfNotSimd = 0> operator RealD() const { return TensorRemove(_internal); } - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator Integer() const { return Integer(TensorRemove(_internal)); } // convert from a something to a scalar via constructor of something arg - template ::value, T>::type - * = nullptr> - strong_inline iScalar operator=(T arg) { + template ::value, T>::type * = nullptr> + strong_inline iScalar operator=(T arg) { _internal = arg; return *this; } diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index e630c217..80009391 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -252,7 +252,8 @@ namespace Grid { template class isSIMDvectorized{ template - static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); + static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, + typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); template static double test(...); diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index e9142f8c..44e8fb52 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - const int Ls=8; + const int Ls=4; Grid_init(&argc,&argv); @@ -76,24 +76,44 @@ int main (int argc, char ** argv) RealD mass=0.01; ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); - MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); BlockConjugateGradient BCG(1.0e-8,10000); MultiRHSConjugateGradient mCG(1.0e-8,10000); - std::cout << GridLogMessage << " Calling CG "< HermOp4d(Ds4d); + FermionField src4d(UGrid); random(pRNG,src4d); + FermionField result4d(UGrid); result4d=zero; + CG(HermOp4d,src4d,result4d); + std::cout << GridLogMessage << "************************************************************************ "< HermOp(Ds); - ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src,result); Grid_finalize(); From a839d5bc55d900f74289ffba21b7f0bcbe8bcfa2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 18 Apr 2017 11:22:17 +0100 Subject: [PATCH 36/36] Updated todo list --- TODO | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/TODO b/TODO index 27579ad3..a2b7bf03 100644 --- a/TODO +++ b/TODO @@ -2,25 +2,24 @@ TODO: --------------- Peter's work list: +1)- Half-precision comms <-- started -- SIMD is prepared +2)- Precision conversion and sort out localConvert <-- --- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started --- Merge high precision reduction into develop <-- done --- Precision conversion and sort out localConvert <-- +3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started +4)- Binary I/O speed up & x-strips + +-- Profile CG, BlockCG, etc... 
Flop count/rate -- PARTIAL, time but no flop/s yet
 -- Physical propagator interface
-
--- multiRHS DWF; benchmark on Cori/BNL for comms elimination
-
 -- slice* linalg routines for multiRHS, BlockCG <-- started
-
--- Profile CG, BlockCG, etc... Flop count/rate
--- Binary I/O speed up & x-strips
--- Half-precision comms <-- started
--- GaugeFix into central location
--- FFTfix in sensible place
--- Multigrid Wilson and DWF, compare to other Multigrid implementations
--- quaternions -- Might not need
-
-
 -- Conserved currents
+-- GaugeFix into central location
+
+-- Multigrid Wilson and DWF, compare to other Multigrid implementations
+-- HDCR resume
+
+Recent DONE
+-- Merge high precision reduction into develop
+-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+ -- slice* linalg routines for multiRHS, BlockCG
 -----
 * Forces; the UdSdU term in gauge force term is half of what I think it should