From 8e161152e457954ba2493b9d807516ae0f2eb030 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 18 Apr 2017 10:51:55 +0100 Subject: [PATCH] MultiRHS solver improvements with slice operations moved into lattice and sped up. Block solver requires a lot of performance work. --- .../iterative/BlockConjugateGradient.h | 85 +++- lib/algorithms/iterative/ConjugateGradient.h | 39 +- lib/lattice/Lattice_reduction.h | 457 +++++++++--------- lib/log/Log.h | 4 +- lib/simd/Grid_vector_types.h | 4 - lib/tensors/Tensor_class.h | 28 +- lib/tensors/Tensor_traits.h | 3 +- .../solver/Test_staggered_block_cg_unprec.cc | 30 +- tests/solver/Test_staggered_cg_unprec.cc | 1 - 9 files changed, 366 insertions(+), 285 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 42a9af56..d90194ae 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -60,8 +60,8 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { int Orthog = 0; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; - std::cout< &Linop, const Field &Src, Field &Psi) Field AP(Src); Field R(Src); - GridStopWatch LinalgTimer; - GridStopWatch MatrixTimer; - GridStopWatch SolverTimer; - Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock); Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock); @@ -116,33 +112,49 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) P = R; sliceInnerProductMatrix(m_rr,R,R,Orthog); + GridStopWatch sliceInnerTimer; + GridStopWatch sliceMaddTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + SolverTimer.Start(); + int k; for (k = 1; k <= MaxIterations; k++) { RealD rrsum=0; for(int b=0;b &Linop, const Field &Src, Field &Psi) if ( max_resid < Tolerance*Tolerance ) { - std::cout << GridLogMessage<<" Block solver has converged in " - < &Linop, const Field &Src, Field &Psi) { int Orthog = 0; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; - std::cout< &Linop, const Field &Src, Field &Psi) P = R; sliceNorm(v_rr,R,Orthog); + GridStopWatch sliceInnerTimer; + GridStopWatch sliceMaddTimer; + GridStopWatch sliceNormTimer; + GridStopWatch MatrixTimer; + GridStopWatch SolverTimer; + + SolverTimer.Start(); int k; for (k = 1; k <= MaxIterations; k++) { RealD rrsum=0; for(int b=0;b &Linop, const Field &Src, Field &Psi) } if ( max_resid < Tolerance*Tolerance ) { - std::cout << GridLogMessage<<" MultiRHS solver has converged in " - < { cp = a; ssq = norm2(src); - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: guess " << guess << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: src " << ssq << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mp " << d << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: mmp " << b << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: cp,r " << cp << std::endl; - std::cout << GridLogIterative << std::setprecision(4) - << "ConjugateGradient: p " << a << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl; + std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl; RealD rsq = Tolerance * Tolerance * ssq; @@ -144,19 +138,20 @@ class ConjugateGradient : public OperatorFunction { RealD resnorm = sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage - << "ConjugateGradient: Converged on iteration " << k << std::endl; - std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq) - << " true residual " << true_residual << " target " - << Tolerance << std::endl; - std::cout << GridLogMessage << "Time elapsed: Iterations " - << SolverTimer.Elapsed() << " Matrix " - << MatrixTimer.Elapsed() << " Linalg " - << LinalgTimer.Elapsed(); - std::cout << std::endl; + std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; + std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)< inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return std::real(nrm); } + // Double inner product template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) @@ -101,7 +102,6 @@ inline auto sum(const LatticeTrinaryExpression & expr) return sum(closure(expr)); } -// FIXME precision promoted summation template inline typename vobj::scalar_object sum(const Lattice &arg) { @@ -141,14 +141,22 @@ inline typename vobj::scalar_object sum(const Lattice &arg) return ssum; } + +////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... +////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { + /////////////////////////////////////////////////////// + // FIXME precision promoted summation + // may be important for correlation functions + // But easily avoided by using double precision fields + /////////////////////////////////////////////////////// typedef typename vobj::scalar_object sobj; GridBase *grid = Data._grid; assert(grid!=NULL); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -163,18 +171,27 @@ template inline void sliceSum(const Lattice &Data,std::vector< std::vector lsSum(ld,zero); // sum across these down to scalars std::vector extracted(Nsimd); // splitting the SIMD - result.resize(fd); // And then global sum to return the same vector to every node for IO to file + result.resize(fd); // And then global sum to return the same vector to every node for(int r=0;r coor(Nd); + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir - for(int ss=0;ssoSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - lvSum[r]=lvSum[r]+Data._odata[ss]; + // Parallel over orthog direction + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n inline void sliceSum(const Lattice &Data,std::vector< } } -template - static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) { @@ -247,8 +238,6 @@ static void sliceInnerProductVector( std::vector & result, const Latti assert(grid!=NULL); conformable(grid,rhs._grid); - // FIXME - // std::cout<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -268,16 +257,18 @@ static void sliceInnerProductVector( std::vector & result, const Latti lvSum[r]=zero; } - // sum over reduced dimension planes, breaking out orthog dir - PARALLEL_REGION { - std::vector coor(Nd); - vector_type vv; - PARALLEL_FOR_LOOP_INTERN - for(int ss=0;ssoSites();ss++){ - Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions); - int r = coor[orthogdim]; - vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); - PARALLEL_CRITICAL { // ouch slow rfo thrashing atomic fp add + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n & result, const Latti std::vector icoor(Nd); for(int rt=0;rt temp; temp._internal = lvSum[rt]; + iScalar temp; + temp._internal = lvSum[rt]; extract(temp,extracted); for(int idx=0;idx & result, const Latti result[t]=gsum; } } -#if 0 template -static void sliceInnerProductVector( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) +static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - // FIXME: Implementation is slow - // Look at sliceSum implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - GridBase * grid = lhs._grid; - - - const int Nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - assert(orthogdim >= 0); - assert(orthogdim < Nd); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - int Nblock = grid->GlobalDimensions()[Orthog]; - int Nrblock = grid->_rdimensions[Orthog]; - int Nthr = grid->SumArraySize(); - - std::vector > sumarray(Nrblock*Nthr); - - parallel_for(int thr=0;thrSumArraySize();thr++){ - - int nwork, mywork, myoff; - - for(int rb=0;rboSites()/Nrblock),thr,mywork,myoff); - int off = rb * grid->_slice_ - vector_type vnrm=zero; // private to thread; sub summation - for(int ss=myoff;ss sip(Nblock); - Lattice IP(lhs._grid); - - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss_ndimension; - int nsimd = BlockSolverGrid->Nsimd(); - - std::vector latt_phys(0); - std::vector simd_phys(0); - std::vector mpi_phys(0); - - for(int d=0;d_fdimensions[d]); - simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); - mpi_phys.push_back(BlockSolverGrid->_processors[d]); - } - } - return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); - } - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum - ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template - static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) - { - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // FIXME: Implementation is slow - // If we based this on Cshift it would work for spread out - // but it would be even slower - // - // Repeated extract slice is inefficient - // - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - for(int i=0;i - static void sliceMaddVector (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) - { - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i - static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Not sure of best solution.. think about it - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - GridBase *FullGrid = lhs._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - int Nblock = FullGrid->GlobalDimensions()[Orthog]; - - Lattice Lslice(SliceGrid); - Lattice Rslice(SliceGrid); - - mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); - - for(int i=0;i - static void sliceNorm (std::vector &sn,const Lattice &rhs,int Orthog) { - typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -499,9 +324,207 @@ template for(int ss=0;ss +static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int orthogdim,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced tensor_reduced; + + GridBase *grid = X._grid; + + int Nsimd =grid->Nsimd(); + int Nblock =grid->GlobalDimensions()[orthogdim]; + + int fd =grid->_fdimensions[orthogdim]; + int ld =grid->_ldimensions[orthogdim]; + int rd =grid->_rdimensions[orthogdim]; + + int e1 =grid->_slice_nblock[orthogdim]; + int e2 =grid->_slice_block [orthogdim]; + int stride =grid->_slice_stride[orthogdim]; + + std::vector icoor; + + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + vector_type av; + + for(int l=0;liCoorFromIindex(icoor,l); + int ldx =r+icoor[orthogdim]*rd; + scalar_type *as =(scalar_type *)&av; + as[l] = scalar_type(a[ldx])*scale; + } + + tensor_reduced at; at=av; + + parallel_for_nest2(int n=0;n +static void sliceMaddVectorSlow (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, + int Orthog,RealD scale=1.0) +{ + // FIXME: Implementation is slow + // Best base the linear combination by constructing a + // set of vectors of size grid->_rdimensions[Orthog]. + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + // If we based this on Cshift it would work for spread out + // but it would be even slower + for(int i=0;i +static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) + { + // FIXME: Implementation is slow + // Look at localInnerProduct implementation, + // and do inside a site loop with block strided iterators + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::tensor_reduced scalar; + typedef typename scalar::scalar_object scomplex; + + int Nblock = lhs._grid->GlobalDimensions()[Orthog]; + + vec.resize(Nblock); + std::vector sip(Nblock); + Lattice IP(lhs._grid); + + IP=localInnerProduct(lhs,rhs); + sliceSum(IP,sip,Orthog); + + for(int ss=0;ss_rdimensions[Orthog]. +////////////////////////////////////////////////////////////////////////////////////////// + +inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) +{ + int NN = BlockSolverGrid->_ndimension; + int nsimd = BlockSolverGrid->Nsimd(); + + std::vector latt_phys(0); + std::vector simd_phys(0); + std::vector mpi_phys(0); + + for(int d=0;d_fdimensions[d]); + simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); + mpi_phys.push_back(BlockSolverGrid->_processors[d]); + } + } + return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } + + +template +static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + + for(int i=0;i +static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) +{ + // FIXME: Implementation is slow + // Not sure of best solution.. think about it + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + GridBase *FullGrid = lhs._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + int Nblock = FullGrid->GlobalDimensions()[Orthog]; + + Lattice Lslice(SliceGrid); + Lattice Rslice(SliceGrid); + + mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); + + for(int i=0;i = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Real by "< = 0> inline Grid_simd rotate(Grid_simd b, int nrot) { nrot = nrot % Grid_simd::Nsimd(); Grid_simd ret; - // std::cout << "Rotate Complex by "< =0> inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Real by "< =0> inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); - // std::cout << "Rotate Complex by "<::vector_type vector_type; typedef typename GridTypeMapper::vector_typeD vector_typeD; typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; - typedef iScalar tensor_reduced; typedef typename GridTypeMapper::scalar_object recurse_scalar_object; + typedef iScalar tensor_reduced; typedef iScalar scalar_object; - // substitutes a real or complex version with same tensor structure typedef iScalar::Complexified> Complexified; typedef iScalar::Realified> Realified; @@ -78,8 +77,12 @@ class iScalar { iScalar & operator= (const iScalar ©me) = default; iScalar & operator= (iScalar &©me) = default; */ - iScalar(scalar_type s) - : _internal(s){}; // recurse down and hit the constructor for vector_type + + // template + // iScalar(EnableIf, vector_type> s) : _internal(s){}; // recurse down and hit the constructor for vector_type + + iScalar(scalar_type s) : _internal(s){}; // recurse down and hit the constructor for vector_type + iScalar(const Zero &z) { *this = zero; }; iScalar &operator=(const Zero &hero) { @@ -135,33 +138,28 @@ class iScalar { strong_inline const vtype &operator()(void) const { return _internal; } // Type casts meta programmed, must be pure scalar to match TensorRemove - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexF() const { return (TensorRemove(_internal)); }; - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator ComplexD() const { return (TensorRemove(_internal)); }; // template = 0,IfNotSimd = // 0> operator RealD () const { return(real(TensorRemove(_internal))); } - template = 0, - IfNotSimd = 0> + template = 0,IfNotSimd = 0> operator RealD() const { return TensorRemove(_internal); } - template = 0, - IfNotSimd = 0> + template = 0, IfNotSimd = 0> operator Integer() const { return Integer(TensorRemove(_internal)); } // convert from a something to a scalar via constructor of something arg - template ::value, T>::type - * = nullptr> - strong_inline iScalar operator=(T arg) { + template ::value, T>::type * = nullptr> + strong_inline iScalar operator=(T arg) { _internal = arg; return *this; } diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h index e630c217..80009391 100644 --- a/lib/tensors/Tensor_traits.h +++ b/lib/tensors/Tensor_traits.h @@ -252,7 +252,8 @@ namespace Grid { template class isSIMDvectorized{ template - static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); + static typename std::enable_if< !std::is_same< typename GridTypeMapper::type>::scalar_type, + typename GridTypeMapper::type>::vector_type>::value, char>::type test(void *); template static double test(...); diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index e9142f8c..44e8fb52 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - const int Ls=8; + const int Ls=4; Grid_init(&argc,&argv); @@ -76,24 +76,44 @@ int main (int argc, char ** argv) RealD mass=0.01; ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); - MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); BlockConjugateGradient BCG(1.0e-8,10000); MultiRHSConjugateGradient mCG(1.0e-8,10000); - std::cout << GridLogMessage << " Calling CG "< HermOp4d(Ds4d); + FermionField src4d(UGrid); random(pRNG,src4d); + FermionField result4d(UGrid); result4d=zero; + CG(HermOp4d,src4d,result4d); + std::cout << GridLogMessage << "************************************************************************ "< HermOp(Ds); - ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src,result); Grid_finalize();