From 7ede6961269516638aa21560f9511ec5140fbcb2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 16 Apr 2017 23:40:00 +0100 Subject: [PATCH] Non compile of tests fixed --- TODO | 17 +- .../iterative/BlockConjugateGradient.h | 201 ------------------ lib/lattice/Lattice_reduction.h | 159 ++++++++++++++ lib/simd/Grid_vector_types.h | 2 +- lib/tensors/Tensor_traits.h | 1 + 5 files changed, 170 insertions(+), 210 deletions(-) diff --git a/TODO b/TODO index 91034f20..27579ad3 100644 --- a/TODO +++ b/TODO @@ -3,19 +3,20 @@ TODO: Peter's work list: --- Merge high precision reduction into develop +-- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started +-- Merge high precision reduction into develop <-- done +-- Precision conversion and sort out localConvert <-- -- Physical propagator interface --- Precision conversion and sort out localConvert --- slice* linalg routines for multiRHS, BlockCG + +-- multiRHS DWF; benchmark on Cori/BNL for comms elimination + -- slice* linalg routines for multiRHS, BlockCG <-- started + -- Profile CG, BlockCG, etc... Flop count/rate -- Binary I/O speed up & x-strips --- Half-precision comms --- multiRHS DWF; benchmark on Cori/BNL for comms elimination +-- Half-precision comms <-- started -- GaugeFix into central location --- Help Julia with NPR code --- Switch to measurements +-- FFTfix in sensible place -- Multigrid Wilson and DWF, compare to other Multigrid implementations --- Remove DenseVector, DenseMatrix; Use Eigen instead. -- quaternions -- Might not need diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 0f4a3a80..1db89512 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -30,210 +30,9 @@ directory #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H #define GRID_BLOCK_CONJUGATE_GRADIENT_H -#include namespace Grid { -GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) -{ - int NN = BlockSolverGrid->_ndimension; - int nsimd = BlockSolverGrid->Nsimd(); - - std::vector latt_phys(0); - std::vector simd_phys(0); - std::vector mpi_phys(0); - - for(int d=0;d_fdimensions[d]); - simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); - mpi_phys.push_back(BlockSolverGrid->_processors[d]); - } - } - return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); -} - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // FIXME: Implementation is slow - // If we based this on Cshift it would work for spread out - // but it would be even slower - // - // Repeated extract slice is inefficient - // - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - for(int i=0;i -static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) -{ - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i -static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - // FIXME: Implementation is slow - // Not sure of best solution.. think about it - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - int Nblock = FullGrid->GlobalDimensions()[Orthog]; - - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - for(int i=0;i -static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss -static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = rhs._grid->GlobalDimensions()[Orthog]; - std::vector ip(Nblock); - sn.resize(Nblock); - - sliceInnerProductVector(ip,rhs,rhs,Orthog); - for(int ss=0;ss -static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - std::cout << " sliceInnerProductMatrix Dim "< IP(lhs._grid); - std::vector sip(Nblock); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - Lattice tmp = rhs; - - for(int s1=0;s1 #ifndef GRID_LATTICE_REDUCTION_H #define GRID_LATTICE_REDUCTION_H +#include + namespace Grid { #ifdef GRID_WARN_SUBOPTIMAL #warning "Optimisation alert all these reduction loops are NOT threaded " @@ -215,6 +217,163 @@ template inline void sliceSum(const Lattice &Data,std::vector< } } +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) + { + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); + } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template + static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) + { + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // FIXME: Implementation is slow + // If we based this on Cshift it would work for spread out + // but it would be even slower + // + // Repeated extract slice is inefficient + // + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + for(int i=0;i + static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) + { + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i + static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i + static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss + static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = rhs._grid->GlobalDimensions()[Orthog]; + std::vector ip(Nblock); + sn.resize(Nblock); + + sliceInnerProductVector(ip,rhs,rhs,Orthog); + for(int ss=0;ss