diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h index c2910151..8af8d7ac 100644 --- a/lib/algorithms/CoarsenedMatrix.h +++ b/lib/algorithms/CoarsenedMatrix.h @@ -103,29 +103,32 @@ namespace Grid { GridBase *CoarseGrid; GridBase *FineGrid; std::vector > subspace; + int checkerboard; - Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid) : - CoarseGrid(_CoarseGrid), + Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : + CoarseGrid(_CoarseGrid), FineGrid(_FineGrid), - subspace(nbasis,_FineGrid) + subspace(nbasis,_FineGrid), + checkerboard(_checkerboard) { }; void Orthogonalise(void){ CoarseScalar InnerProd(CoarseGrid); + std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"< pokey(CoarseGrid); - - for(int i=0;ioSites();ss++){ + parallel_for(int ss=0;ssoSites();ss++){ eProj._odata[ss](i)=CComplex(1.0); } eProj=eProj - iProj; @@ -137,6 +140,7 @@ namespace Grid { blockProject(CoarseVec,FineVec,subspace); } void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){ + FineVec.checkerboard = subspace[0].checkerboard; blockPromote(CoarseVec,FineVec,subspace); } void CreateSubspaceRandom(GridParallelRNG &RNG){ @@ -147,6 +151,7 @@ namespace Grid { Orthogonalise(); } + /* virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase &hermop,int nn=nbasis) { // Run a Lanczos with sloppy convergence @@ -195,7 +200,7 @@ namespace Grid { std::cout << GridLogMessage <<"subspace["< &hermop,int nn=nbasis) { RealD scale; diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index f1b8820e..26746e6e 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -308,20 +308,34 @@ namespace Grid { public: SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){}; virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + GridLogIterative.TimingMode(1); + std::cout << GridLogIterative << " HermOpAndNorm "< class IdentityLinearFunction : public LinearFunction { + public: + void operator() (const Field &in, Field &out){ + out = in; + }; + }; + + ///////////////////////////////////////////////////////////// // Base classes for Multishift solvers for operators ///////////////////////////////////////////////////////////// @@ -368,6 +390,64 @@ namespace Grid { }; */ + //////////////////////////////////////////////////////////////////////////////////////////// + // Hermitian operator Linear function and operator function + //////////////////////////////////////////////////////////////////////////////////////////// + template + class HermOpOperatorFunction : public OperatorFunction { + void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) { + Linop.HermOp(in,out); + }; + }; + + template + class PlainHermOp : public LinearFunction { + public: + LinearOperatorBase &_Linop; + + PlainHermOp(LinearOperatorBase& linop) : _Linop(linop) + {} + + void operator()(const Field& in, Field& out) { + _Linop.HermOp(in,out); + } + }; + + template + class FunctionHermOp : public LinearFunction { + public: + OperatorFunction & _poly; + LinearOperatorBase &_Linop; + + FunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop) + : _poly(poly), _Linop(linop) {}; + + void operator()(const Field& in, Field& out) { + _poly(_Linop,in,out); + } + }; + + template + class Polynomial : public OperatorFunction { + private: + std::vector Coeffs; + public: + Polynomial(std::vector &_Coeffs) : Coeffs(_Coeffs) { }; + + // Implement the required interface + void operator() 
(LinearOperatorBase &Linop, const Field &in, Field &out) { + + Field AtoN(in._grid); + Field Mtmp(in._grid); + AtoN = in; + out = AtoN*Coeffs[0]; + for(int n=1;n namespace Grid { - //////////////////////////////////////////////////////////////////////////////////////////// - // Simple general polynomial with user supplied coefficients - //////////////////////////////////////////////////////////////////////////////////////////// - template - class HermOpOperatorFunction : public OperatorFunction { - void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) { - Linop.HermOp(in,out); - }; - }; - - template - class Polynomial : public OperatorFunction { - private: - std::vector Coeffs; - public: - Polynomial(std::vector &_Coeffs) : Coeffs(_Coeffs) { }; - - // Implement the required interface - void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) { - - Field AtoN(in._grid); - Field Mtmp(in._grid); - AtoN = in; - out = AtoN*Coeffs[0]; -// std::cout <<"Poly in " < -Author: paboyle -Author: Chulwoo Jung -Author: Christoph Lehner - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_BIRL_H -#define GRID_BIRL_H - -#include //memset - -#include -#include - -#include -#include -#include - -namespace Grid { - -///////////////////////////////////////////////////////////// -// Implicitly restarted lanczos -///////////////////////////////////////////////////////////// - - template - class BlockImplicitlyRestartedLanczos { - - const RealD small = 1.0e-16; -public: - int lock; - int get; - int Niter; - int converged; - - int Nminres; // Minimum number of restarts; only check for convergence after - int Nstop; // Number of evecs checked for convergence - int Nk; // Number of converged sought - int Np; // Np -- Number of spare vecs in kryloc space - int Nm; // Nm -- total number of vectors - - int orth_period; - - RealD OrthoTime; - - RealD eresid, betastp; - SortEigen _sort; - LinearFunction &_HermOp; - LinearFunction &_HermOpTest; - ///////////////////////// - // Constructor - ///////////////////////// - - BlockImplicitlyRestartedLanczos( - LinearFunction & HermOp, - LinearFunction & HermOpTest, - int _Nstop, // sought vecs - int _Nk, // sought vecs - int _Nm, // spare vecs - RealD _eresid, // resid in lmdue deficit - RealD _betastp, // if beta(k) < betastp: converged - int _Niter, // Max iterations - int _Nminres, int _orth_period = 1) : - _HermOp(HermOp), - _HermOpTest(HermOpTest), - Nstop(_Nstop), - Nk(_Nk), - Nm(_Nm), - eresid(_eresid), - betastp(_betastp), - Niter(_Niter), - Nminres(_Nminres), - orth_period(_orth_period) - { - Np = Nm-Nk; assert(Np>0); - }; - - BlockImplicitlyRestartedLanczos( - LinearFunction & HermOp, - 
LinearFunction & HermOpTest, - int _Nk, // sought vecs - int _Nm, // spare vecs - RealD _eresid, // resid in lmdue deficit - RealD _betastp, // if beta(k) < betastp: converged - int _Niter, // Max iterations - int _Nminres, - int _orth_period = 1) : - _HermOp(HermOp), - _HermOpTest(HermOpTest), - Nstop(_Nk), - Nk(_Nk), - Nm(_Nm), - eresid(_eresid), - betastp(_betastp), - Niter(_Niter), - Nminres(_Nminres), - orth_period(_orth_period) - { - Np = Nm-Nk; assert(Np>0); - }; - - -/* Saad PP. 195 -1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0 -2. For k = 1,2,...,m Do: -3. wk:=Avk−βkv_{k−1} -4. αk:=(wk,vk) // -5. wk:=wk−αkvk // wk orthog vk -6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop -7. vk+1 := wk/βk+1 -8. EndDo - */ - void step(std::vector& lmd, - std::vector& lme, - BasisFieldVector& evec, - Field& w,int Nm,int k) - { - assert( k< Nm ); - - GridStopWatch gsw_op,gsw_o; - - Field& evec_k = evec[k]; - - gsw_op.Start(); - _HermOp(evec_k,w); - gsw_op.Stop(); - - if(k>0){ - w -= lme[k-1] * evec[k-1]; - } - - ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) - RealD alph = real(zalph); - - w = w - alph * evec_k;// 5. wk:=wk−αkvk - - RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop - // 7. vk+1 := wk/βk+1 - - std::cout<0 && k % orth_period == 0) { - orthogonalize(w,evec,k); // orthonormalise - } - gsw_o.Stop(); - - if(k < Nm-1) { - evec[k+1] = w; - } - - std::cout << GridLogMessage << "Timing: operator=" << gsw_op.Elapsed() << - " orth=" << gsw_o.Elapsed() << std::endl; - - } - - void qr_decomp(std::vector& lmd, - std::vector& lme, - int Nk, - int Nm, - std::vector& Qt, - RealD Dsh, - int kmin, - int kmax) - { - int k = kmin-1; - RealD x; - - RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]); - RealD c = ( lmd[k] -Dsh) *Fden; - RealD s = -lme[k] *Fden; - - RealD tmpa1 = lmd[k]; - RealD tmpa2 = lmd[k+1]; - RealD tmpb = lme[k]; - - lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb; - lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb; - lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb; - x =-s*lme[k+1]; - lme[k+1] = c*lme[k+1]; - - for(int i=0; i& lmd, - std::vector& lme, - int N1, - int N2, - std::vector& Qt, - GridBase *grid){ - - std::cout << GridLogMessage << "diagonalize_lapack start\n"; - GridStopWatch gsw; - - const int size = Nm; - // tevals.resize(size); - // tevecs.resize(size); - LAPACK_INT NN = N1; - std::vector evals_tmp(NN); - std::vector evec_tmp(NN*NN); - memset(&evec_tmp[0],0,sizeof(double)*NN*NN); - // double AA[NN][NN]; - std::vector DD(NN); - std::vector EE(NN); - for (int i = 0; i< NN; i++) - for (int j = i - 1; j <= i + 1; j++) - if ( j < NN && j >= 0 ) { - if (i==j) DD[i] = lmd[i]; - if (i==j) evals_tmp[i] = lmd[i]; - if (j==(i-1)) EE[j] = lme[j]; - } - LAPACK_INT evals_found; - LAPACK_INT lwork = ( (18*NN) > (1+4*NN+NN*NN)? 
(18*NN):(1+4*NN+NN*NN)) ; - LAPACK_INT liwork = 3+NN*10 ; - std::vector iwork(liwork); - std::vector work(lwork); - std::vector isuppz(2*NN); - char jobz = 'V'; // calculate evals & evecs - char range = 'I'; // calculate all evals - // char range = 'A'; // calculate all evals - char uplo = 'U'; // refer to upper half of original matrix - char compz = 'I'; // Compute eigenvectors of tridiagonal matrix - std::vector ifail(NN); - LAPACK_INT info; - // int total = QMP_get_number_of_nodes(); - // int node = QMP_get_node_number(); - // GridBase *grid = evec[0]._grid; - int total = grid->_Nprocessors; - int node = grid->_processor; - int interval = (NN/total)+1; - double vl = 0.0, vu = 0.0; - LAPACK_INT il = interval*node+1 , iu = interval*(node+1); - if (iu > NN) iu=NN; - double tol = 0.0; - if (1) { - memset(&evals_tmp[0],0,sizeof(double)*NN); - if ( il <= NN){ - std::cout << GridLogMessage << "dstegr started" << std::endl; - gsw.Start(); - dstegr(&jobz, &range, &NN, - (double*)&DD[0], (double*)&EE[0], - &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' - &tol, // tolerance - &evals_found, &evals_tmp[0], (double*)&evec_tmp[0], &NN, - &isuppz[0], - &work[0], &lwork, &iwork[0], &liwork, - &info); - gsw.Stop(); - std::cout << GridLogMessage << "dstegr completed in " << gsw.Elapsed() << std::endl; - for (int i = iu-1; i>= il-1; i--){ - evals_tmp[i] = evals_tmp[i - (il-1)]; - if (il>1) evals_tmp[i-(il-1)]=0.; - for (int j = 0; j< NN; j++){ - evec_tmp[i*NN + j] = evec_tmp[(i - (il-1)) * NN + j]; - if (il>1) evec_tmp[(i-(il-1)) * NN + j]=0.; - } - } - } - { - // QMP_sum_double_array(evals_tmp,NN); - // QMP_sum_double_array((double *)evec_tmp,NN*NN); - grid->GlobalSumVector(&evals_tmp[0],NN); - grid->GlobalSumVector(&evec_tmp[0],NN*NN); - } - } - // cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order. 
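(The DD and EE arrays handed to dstegr above are just the diagonal and off-diagonal of the Lanczos tridiagonal matrix: DD is filled from lmd, the alpha_k of the Saad pseudocode quoted earlier, and EE from lme, the beta_k. In that notation the object being diagonalised is

\[
A V_M = V_M H_M + f_M e_M^{\dagger},
\qquad
H_M =
\begin{pmatrix}
\alpha_1 & \beta_2  &         &         \\
\beta_2  & \alpha_2 & \ddots  &         \\
         & \ddots   & \ddots  & \beta_M \\
         &          & \beta_M & \alpha_M
\end{pmatrix},
\]

and the same matrix is what the Eigen and QR code paths in the replacement ImplicitlyRestartedLanczos.h diagonalise.)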
- for(int i=0;i& lmd, - std::vector& lme, - int N2, - int N1, - std::vector& Qt, - GridBase *grid) - { - -#ifdef USE_LAPACK_IRL - const int check_lapack=0; // just use lapack if 0, check against lapack if 1 - - if(!check_lapack) - return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid); - - std::vector lmd2(N1); - std::vector lme2(N1); - std::vector Qt2(N1*N1); - for(int k=0; k= kmin; --j){ - RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); - if(fabs(lme[j-1])+dds > dds){ - kmax = j+1; - goto continued; - } - } - Niter = iter; -#ifdef USE_LAPACK_IRL - if(check_lapack){ - const double SMALL=1e-8; - diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid); - std::vector lmd3(N2); - for(int k=0; kSMALL) std::cout<SMALL) std::cout<SMALL) std::cout< dds){ - kmin = j+1; - break; - } - } - } - std::cout< - static RealD normalise(T& v) - { - RealD nn = norm2(v); - nn = sqrt(nn); - v = v * (1.0/nn); - return nn; - } - - void orthogonalize(Field& w, - BasisFieldVector& evec, - int k) - { - double t0=-usecond()/1e6; - - evec.orthogonalize(w,k); - - normalise(w); - t0+=usecond()/1e6; - OrthoTime +=t0; - } - - void setUnit_Qt(int Nm, std::vector &Qt) { - for(int i=0; i K P = M − K † -Compute the factorization AVM = VM HM + fM eM -repeat - Q=I - for i = 1,...,P do - QiRi =HM −θiI Q = QQi - H M = Q †i H M Q i - end for - βK =HM(K+1,K) σK =Q(M,K) - r=vK+1βK +rσK - VK =VM(1:M)Q(1:M,1:K) - HK =HM(1:K,1:K) - →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM -until convergence -*/ - - void calc(std::vector& eval, - BasisFieldVector& evec, - const Field& src, - int& Nconv, - bool reverse, - int SkipTest) - { - - GridBase *grid = evec._v[0]._grid;//evec.get(0 + evec_offset)._grid; - assert(grid == src._grid); - - std::cout< lme(Nm); - std::vector lme2(Nm); - std::vector eval2(Nm); - std::vector eval2_copy(Nm); - std::vector Qt(Nm*Nm); - - - Field f(grid); - Field v(grid); - - int k1 = 1; - int k2 = Nk; - - Nconv = 0; - - RealD beta_k; - - // Set initial vector - evec[0] = src; - normalise(evec[0]); - std:: cout<0); - evec.rotate(Qt,k1-1,k2+1,0,Nm,Nm); - - t1=usecond()/1e6; - std::cout<= Nminres) { - std::cout << GridLogMessage << "Rotation to test convergence " << std::endl; - - Field ev0_orig(grid); - ev0_orig = evec[0]; - - evec.rotate(Qt,0,Nk,0,Nk,Nm); - - { - std::cout << GridLogMessage << "Test convergence" << std::endl; - Field B(grid); - - for(int j = 0; j=Nstop || beta_k < betastp){ - goto converged; - } - - std::cout << GridLogMessage << "Rotate back" << std::endl; - //B[j] +=Qt[k+_Nm*j] * _v[k]._odata[ss]; - { - Eigen::MatrixXd qm = Eigen::MatrixXd::Zero(Nk,Nk); - for (int k=0;k QtI(Nm*Nm); - for (int k=0;k -class BasisFieldVector { - public: - int _Nm; - - typedef typename Field::scalar_type Coeff_t; - typedef typename Field::vector_type vCoeff_t; - typedef typename Field::vector_object vobj; - typedef typename vobj::scalar_object sobj; - - std::vector _v; // _Nfull vectors - - void report(int n,GridBase* value) { - - std::cout << GridLogMessage << "BasisFieldVector allocated:\n"; - std::cout << GridLogMessage << " Delta N = " << n << "\n"; - std::cout << GridLogMessage << " Size of full vectors (size) = " << - ((double)n*sizeof(vobj)*value->oSites() / 1024./1024./1024.) 
<< " GB\n"; - std::cout << GridLogMessage << " Size = " << _v.size() << " Capacity = " << _v.capacity() << std::endl; - - value->Barrier(); - - if (value->IsBoss()) { - system("cat /proc/meminfo"); - } - - value->Barrier(); - - } - - BasisFieldVector(int Nm,GridBase* value) : _Nm(Nm), _v(Nm,value) { - report(Nm,value); - } - - ~BasisFieldVector() { - } - - Field& operator[](int i) { - return _v[i]; - } - - void orthogonalize(Field& w, int k) { - for(int j=0; j& Qt,int j0, int j1, int k0,int k1,int Nm) { - - GridBase* grid = _v[0]._grid; - -#pragma omp parallel - { - std::vector < vobj > B(Nm); - -#pragma omp for - for(int ss=0;ss < grid->oSites();ss++){ - for(int j=j0; j _Nm) - _v.reserve(n); - - _v.resize(n,_v[0]._grid); - - if (n < _Nm) - _v.shrink_to_fit(); - - report(n - _Nm,_v[0]._grid); - - _Nm = n; - } - - std::vector getIndex(std::vector& sort_vals) { - - std::vector idx(sort_vals.size()); - iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - sort(idx.begin(), idx.end(), - [&sort_vals](int i1, int i2) {return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);}); - - return idx; - } - - void reorderInPlace(std::vector& sort_vals, std::vector& idx) { - GridStopWatch gsw; - gsw.Start(); - - int nswaps = 0; - for (size_t i=0;i& sort_vals, bool reverse) { - - std::vector idx = getIndex(sort_vals); - if (reverse) - std::reverse(idx.begin(), idx.end()); - - reorderInPlace(sort_vals,idx); - - } - - void deflate(const std::vector& eval,const Field& src_orig,Field& result) { - result = zero; - int N = (int)_v.size(); - for (int i=0;i { cp = a; ssq = norm2(src); - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mmp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: cp,r " << cp << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: p " << a << std::endl; RealD rsq = Tolerance * Tolerance * ssq; @@ -92,7 +92,7 @@ class ConjugateGradient : public OperatorFunction { return; } - std::cout << GridLogIterative << std::setprecision(4) + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index a8723f32..7b85c095 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -7,8 +7,9 @@ Copyright (C) 2015 Author: Peter Boyle 
-Author: Chulwoo Jung -Author: Guido Cossu +Author: paboyle +Author: Chulwoo Jung +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,125 +28,288 @@ Author: Guido Cossu See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#ifndef GRID_IRL_H -#define GRID_IRL_H +#ifndef GRID_BIRL_H +#define GRID_BIRL_H #include //memset +//#include +#include -namespace Grid { +namespace Grid { - enum IRLdiagonalisation { - IRLdiagonaliseWithDSTEGR, - IRLdiagonaliseWithQR, - IRLdiagonaliseWithEigen - }; - -//////////////////////////////////////////////////////////////////////////////// -// Helper class for sorting the evalues AND evectors by Field -// Use pointer swizzle on vectors -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////// + // Move following 100 LOC to lattice/Lattice_basis.h + //////////////////////////////////////////////////////// template -class SortEigen { - private: - static bool less_lmd(RealD left,RealD right){ - return left > right; - } - static bool less_pair(std::pair& left, - std::pair& right){ - return left.first > (right.first); - } - - public: - void push(std::vector& lmd,std::vector& evec,int N) { - - //////////////////////////////////////////////////////////////////////// - // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set. - // : The vector reorder should be done by pointer swizzle somehow - //////////////////////////////////////////////////////////////////////// - std::vector cpy(lmd.size(),evec[0]._grid); - for(int i=0;i > emod(lmd.size()); +void basisOrthogonalize(std::vector &basis,Field &w,int k) +{ + for(int j=0; j(lmd[i],&cpy[i]); - - partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair); - - typename std::vector >::iterator it = emod.begin(); - for(int i=0;ifirst; - evec[i]=*(it->second); - ++it; +template +void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) +{ + typedef typename Field::vector_object vobj; + GridBase* grid = basis[0]._grid; + + parallel_region + { + std::vector < vobj > B(Nm); // Thread private + + parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ + for(int j=j0; j& lmd,int N) { - std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd); +} + +// Extract a single rotated vector +template +void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) +{ + typedef typename Field::vector_object vobj; + GridBase* grid = basis[0]._grid; + + result.checkerboard = basis[0].checkerboard; + parallel_for(int ss=0;ss < grid->oSites();ss++){ + vobj B = zero; + for(int k=k0; k fabs(thrs); +} + +template +void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) +{ + int vlen = idx.size(); + + assert(vlen>=1); + assert(vlen<=sort_vals.size()); + assert(vlen<=_v.size()); + + for (size_t i=0;ii for which _vnew[j] = _vold[i], + // track the move idx[j] => idx[i] + // track the move idx[i] => i + ////////////////////////////////////// + size_t j; + for (j=i;j i); assert(j!=idx.size()); assert(idx[j]==i); + + std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy + std::swap(sort_vals[i],sort_vals[idx[i]]); + + idx[j] = idx[i]; + idx[i] = i; + } 
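      // Each pass of this loop places _v[i] in its final slot with a single
      // _odata swap; pointing idx[j] at the displaced entry keeps the index
      // map consistent, so the reorder costs at most one swap per element and
      // never copies a full lattice field.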
} -}; +} + +inline std::vector basisSortGetIndex(std::vector& sort_vals) +{ + std::vector idx(sort_vals.size()); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { + return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); + }); + return idx; +} + +template +void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) +{ + std::vector idx = basisSortGetIndex(sort_vals); + if (reverse) + std::reverse(idx.begin(), idx.end()); + + basisReorderInPlace(_v,sort_vals,idx); +} + +// PAB: faster to compute the inner products first then fuse loops. +// If performance critical can improve. +template +void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { + result = zero; + assert(_v.size()==eval.size()); + int N = (int)_v.size(); + for (int i=0;i class ImplicitlyRestartedLanczosTester +{ + public: + virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0; + virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0; +}; + +enum IRLdiagonalisation { + IRLdiagonaliseWithDSTEGR, + IRLdiagonaliseWithQR, + IRLdiagonaliseWithEigen +}; + +template class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester +{ + public: + LinearFunction &_HermOp; + ImplicitlyRestartedLanczosHermOpTester(LinearFunction &HermOp) : _HermOp(HermOp) { }; + int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) + { + return TestConvergence(j,resid,B,eval,evalMaxApprox); + } + int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox) + { + Field v(B); + RealD eval_poly = eval; + // Apply operator + _HermOp(B,v); + + RealD vnum = real(innerProduct(B,v)); // HermOp. 
+ RealD vden = norm2(B); + RealD vv0 = norm2(v); + eval = vnum/vden; + v -= eval*B; + + RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0); + + std::cout.precision(13); + std::cout< class ImplicitlyRestartedLanczos { - -private: - - int MaxIter; // Max iterations - int Nstop; // Number of evecs checked for convergence - int Nk; // Number of converged sought - int Nm; // Nm -- total number of vectors - RealD eresid; + private: + const RealD small = 1.0e-8; + int MaxIter; + int MinRestart; // Minimum number of restarts; only check for convergence after + int Nstop; // Number of evecs checked for convergence + int Nk; // Number of converged sought + // int Np; // Np -- Number of spare vecs in krylov space // == Nm - Nk + int Nm; // Nm -- total number of vectors IRLdiagonalisation diagonalisation; - //////////////////////////////////// + int orth_period; + + RealD OrthoTime; + RealD eresid, betastp; + //////////////////////////////// // Embedded objects - //////////////////////////////////// - SortEigen _sort; - LinearOperatorBase &_Linop; - OperatorFunction &_poly; - + //////////////////////////////// + LinearFunction &_PolyOp; + LinearFunction &_HermOp; + ImplicitlyRestartedLanczosTester &_Tester; + // Default tester provided (we need a ref to something in default case) + ImplicitlyRestartedLanczosHermOpTester SimpleTester; ///////////////////////// // Constructor ///////////////////////// + public: - ImplicitlyRestartedLanczos(LinearOperatorBase &Linop, // op - OperatorFunction & poly, // polynomial - int _Nstop, // really sought vecs - int _Nk, // sought vecs - int _Nm, // total vecs - RealD _eresid, // resid in lmd deficit - int _MaxIter, // Max iterations - IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen ) : - _Linop(Linop), _poly(poly), - Nstop(_Nstop), Nk(_Nk), Nm(_Nm), - eresid(_eresid), MaxIter(_MaxIter), - diagonalisation(_diagonalisation) - { }; + ////////////////////////////////////////////////////////////////// + // PAB: + ////////////////////////////////////////////////////////////////// + // Too many options & knobs. + // Eliminate: + // orth_period + // betastp + // MinRestart + // + // Do we really need orth_period + // What is the theoretical basis & guarantees of betastp ? + // Nstop=Nk viable? + // MinRestart avoidable with new convergence test? + // Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation) + // HermOp could be eliminated if we dropped the Power method for max eval. 
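  //    (Aside, a hedged sketch only, with an illustrative iteration count: the
  //     "Power method for max eval" mentioned here is a crude estimate of the
  //     largest eigenvalue, used to normalise the residual test
  //       || H B - lambda B ||^2 / evalMaxApprox^2  <  eresid^2
  //     performed by the testers above. In outline:
  //
  //       Field x = src;          normalise(x);
  //       Field y(x._grid);
  //       RealD evalMaxApprox = 0.0;
  //       for(int it=0; it<50; it++){                          // a few iterations suffice
  //         _HermOp(x,y);                                      // y = H x
  //         evalMaxApprox = real(innerProduct(x,y))/norm2(x);  // Rayleigh quotient
  //         x = y;  normalise(x);
  //       }
  //    )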
+ // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear + ////////////////////////////////////////////////////////////////// + ImplicitlyRestartedLanczos(LinearFunction & PolyOp, + LinearFunction & HermOp, + ImplicitlyRestartedLanczosTester & Tester, + int _Nstop, // sought vecs + int _Nk, // sought vecs + int _Nm, // spare vecs + RealD _eresid, // resid in lmdue deficit + int _MaxIter, // Max iterations + RealD _betastp=0.0, // if beta(k) < betastp: converged + int _MinRestart=1, int _orth_period = 1, + IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : + SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester), + Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), + eresid(_eresid), betastp(_betastp), + MaxIter(_MaxIter) , MinRestart(_MinRestart), + orth_period(_orth_period), diagonalisation(_diagonalisation) { }; + + ImplicitlyRestartedLanczos(LinearFunction & PolyOp, + LinearFunction & HermOp, + int _Nstop, // sought vecs + int _Nk, // sought vecs + int _Nm, // spare vecs + RealD _eresid, // resid in lmdue deficit + int _MaxIter, // Max iterations + RealD _betastp=0.0, // if beta(k) < betastp: converged + int _MinRestart=1, int _orth_period = 1, + IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : + SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester), + Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), + eresid(_eresid), betastp(_betastp), + MaxIter(_MaxIter) , MinRestart(_MinRestart), + orth_period(_orth_period), diagonalisation(_diagonalisation) { }; //////////////////////////////// // Helpers //////////////////////////////// - static RealD normalise(Field& v) + template static RealD normalise(T& v) { RealD nn = norm2(v); nn = sqrt(nn); v = v * (1.0/nn); return nn; } - - void orthogonalize(Field& w, std::vector& evec, int k) + + void orthogonalize(Field& w, std::vector& evec,int k) { - typedef typename Field::scalar_type MyComplex; - MyComplex ip; - - for(int j=0; j& eval, std::vector& evec, const Field& src, int& Nconv) + void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse=false) { + GridBase *grid = src._grid; + assert(grid == evec[0]._grid); - GridBase *grid = evec[0]._grid; - assert(grid == src._grid); - - std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; - std::cout << GridLogMessage <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl; - std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; - std::cout << GridLogMessage <<" -- seek Nk = " << Nk <<" vectors"<< std::endl; - std::cout << GridLogMessage <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl; - std::cout << GridLogMessage <<" -- total Nm = " << Nm <<" vectors"<< std::endl; - std::cout << GridLogMessage <<" -- size of eval = " << eval.size() << std::endl; - std::cout << GridLogMessage <<" -- size of evec = " << evec.size() << std::endl; + GridLogIRL.TimingMode(1); + std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; + std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl; + std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; + std::cout << GridLogIRL <<" -- seek Nk = " << Nk <<" vectors"<< std::endl; + std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< 
std::endl; + std::cout << GridLogIRL <<" -- total Nm = " << Nm <<" vectors"<< std::endl; + std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl; + std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl; if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) { - std::cout << GridLogMessage << "Diagonalisation is DSTEGR "< lme(Nm); std::vector lme2(Nm); std::vector eval2(Nm); + std::vector eval2_copy(Nm); + Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); - Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); - - std::vector Iconv(Nm); - std::vector B(Nm,grid); // waste of space replicating - Field f(grid); Field v(grid); - int k1 = 1; int k2 = Nk; - - Nconv = 0; - RealD beta_k; + + Nconv = 0; // Set initial vector evec[0] = src; - std::cout << GridLogMessage <<"norm2(src)= " << norm2(src)<()); + std::cout<0); + + basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis + std::cout<= MinRestart) { - if( Nconv>=Nstop ){ - goto converged; - } - } // end of iter loop - - std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; - std::cout<< GridLogError <<" ImplicitlyRestartedLanczos::calc() NOT converged."; - std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; + std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl; + + Field B(grid); B.checkerboard = evec[0].checkerboard; + + // power of two search pattern; not every evalue in eval2 is assessed. + for(int jj = 1; jj<=Nstop; jj*=2){ + int j = Nstop-jj; + RealD e = eval2_copy[j]; // Discard the evalue + basisRotateJ(B,evec,Qt,j,0,Nk,Nm); + if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) { + if ( j > Nconv ) { + Nconv=j+1; + jj=Nstop; // Terminate the scan + } + } + } + // Do evec[0] for good measure + { + int j=0; + RealD e = eval2_copy[0]; + basisRotateJ(B,evec,Qt,j,0,Nk,Nm); + _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox); + } + // test if we converged, if so, terminate + std::cout<= "<=Nstop || beta_k < betastp){ + if( Nconv>=Nstop){ + goto converged; + } + + } else { + std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n"; + } // end of iter loop + } + + std::cout<0) w -= lme[k-1] * evec[k-1]; - - ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk) + + ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) RealD alph = real(zalph); - - w = w - alph * evec[k];// 5. wk:=wk−αkvk - + + w = w - alph * evec_k;// 5. wk:=wk−αkvk + RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop // 7. vk+1 := wk/βk+1 - + lmd[k] = alph; lme[k] = beta; - - if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise - if ( k < Nm-1) evec[k+1] = w; - - if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<0 && k % orth_period == 0) { + orthogonalize(w,evec,k); // orthonormalise + std::cout<& lmd, std::vector& lme, int Nk, int Nm, Eigen::MatrixXd & Qt, // Nm x Nm @@ -404,11 +632,11 @@ private: } } } - /////////////////////////////////////////////////////////////////////////// - // File could end here if settle on Eigen ??? - /////////////////////////////////////////////////////////////////////////// - void qr_decomp(std::vector& lmd, // Nm + /////////////////////////////////////////////////////////////////////////// + // File could end here if settle on Eigen ??? !!! 
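(The QR_decomp routine below is the implicit-restart kernel: each unwanted Ritz value theta_i is applied as a shift,

\[
Q_i R_i = H_M - \theta_i I, \qquad H_M \leftarrow Q_i^{\dagger} H_M Q_i, \qquad Q \leftarrow Q\, Q_i ,
\]

as in the Saad-style pseudocode quoted in the removed BlockImplicitlyRestartedLanczos header, and the accumulated Q is the matrix that calc() hands to basisRotate/basisRotateJ to compress the Krylov space back onto the leading K vectors, V_K = V_M Q(:,1:K).)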
+ /////////////////////////////////////////////////////////////////////////// + void QR_decomp(std::vector& lmd, // Nm std::vector& lme, // Nm int Nk, int Nm, // Nk, Nm Eigen::MatrixXd& Qt, // Nm x Nm matrix @@ -575,51 +803,50 @@ void diagonalize_lapack(std::vector& lmd, #endif } - void diagonalize_QR(std::vector& lmd, std::vector& lme, - int Nk, int Nm, - Eigen::MatrixXd & Qt, - GridBase *grid) - { - int Niter = 100*Nm; - int kmin = 1; - int kmax = Nk; - - // (this should be more sophisticated) - for(int iter=0; iter= kmin; --j){ - RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); - if(fabs(lme[j-1])+dds > dds){ - kmax = j+1; - goto continued; - } - } - Niter = iter; - return; - - continued: - for(int j=0; j dds){ - kmin = j+1; - break; - } +void diagonalize_QR(std::vector& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, + GridBase *grid) +{ + int QRiter = 100*Nm; + int kmin = 1; + int kmax = Nk; + + // (this should be more sophisticated) + for(int iter=0; iter= kmin; --j){ + RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); + if(fabs(lme[j-1])+dds > dds){ + kmax = j+1; + goto continued; + } + } + QRiter = iter; + return; + + continued: + for(int j=0; j dds){ + kmin = j+1; + break; } } - std::cout << GridLogError << "[QL method] Error - Too many iteration: "< +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_LOCAL_COHERENCE_IRL_H +#define GRID_LOCAL_COHERENCE_IRL_H +namespace Grid { +struct LanczosParams : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, + ChebyParams, Cheby,/*Chebyshev*/ + int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/ + int, Nk, /*Vecs in Lanczos seek converge*/ + int, Nm, /*Total vecs in Lanczos include restart*/ + RealD, resid, /*residual*/ + int, MaxIt, + RealD, betastp, /* ? 
*/ + int, MinRes); // Must restart +}; + +struct LocalCoherenceLanczosParams : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, + bool, doFine, + bool, doFineRead, + bool, doCoarse, + bool, doCoarseRead, + LanczosParams, FineParams, + LanczosParams, CoarseParams, + ChebyParams, Smoother, + RealD , coarse_relax_tol, + std::vector, blockSize, + std::string, config, + std::vector < std::complex >, omega, + RealD, mass, + RealD, M5); +}; + +// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function +template +class ProjectedHermOp : public LinearFunction > > { +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + LinearOperatorBase &_Linop; + Aggregation &_Aggregate; + + ProjectedHermOp(LinearOperatorBase& linop, Aggregation &aggregate) : + _Linop(linop), + _Aggregate(aggregate) { }; + + void operator()(const CoarseField& in, CoarseField& out) { + + GridBase *FineGrid = _Aggregate.FineGrid; + FineField fin(FineGrid); + FineField fout(FineGrid); + + _Aggregate.PromoteFromSubspace(in,fin); std::cout< +class ProjectedFunctionHermOp : public LinearFunction > > { +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + + OperatorFunction & _poly; + LinearOperatorBase &_Linop; + Aggregation &_Aggregate; + + ProjectedFunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop, + Aggregation &aggregate) : + _poly(poly), + _Linop(linop), + _Aggregate(aggregate) { }; + + void operator()(const CoarseField& in, CoarseField& out) { + + GridBase *FineGrid = _Aggregate.FineGrid; + + FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard; + FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard; + + _Aggregate.PromoteFromSubspace(in,fin); std::cout< +class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester > > +{ + public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + LinearFunction & _Poly; + OperatorFunction & _smoother; + LinearOperatorBase &_Linop; + Aggregation &_Aggregate; + RealD _coarse_relax_tol; + ImplicitlyRestartedLanczosSmoothedTester(LinearFunction &Poly, + OperatorFunction &smoother, + LinearOperatorBase &Linop, + Aggregation &Aggregate, + RealD coarse_relax_tol=5.0e3) + : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { }; + + int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) + { + CoarseField v(B); + RealD eval_poly = eval; + // Apply operator + _Poly(B,v); + + RealD vnum = real(innerProduct(B,v)); // HermOp. 
+ RealD vden = norm2(B); + RealD vv0 = norm2(v); + eval = vnum/vden; + v -= eval*B; + + RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0); + + std::cout.precision(13); + std::cout< nbasis ) eresid = eresid*_coarse_relax_tol; + if( (vv +class LocalCoherenceLanczos +{ +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice CoarseField; + typedef Lattice FineField; + +protected: + GridBase *_CoarseGrid; + GridBase *_FineGrid; + int _checkerboard; + LinearOperatorBase & _FineOp; + + // FIXME replace Aggregation with vector of fine; the code reuse is too small for + // the hassle and complexity of cross coupling. + Aggregation _Aggregate; + std::vector evals_fine; + std::vector evals_coarse; + std::vector evec_coarse; +public: + LocalCoherenceLanczos(GridBase *FineGrid, + GridBase *CoarseGrid, + LinearOperatorBase &FineOp, + int checkerboard) : + _CoarseGrid(CoarseGrid), + _FineGrid(FineGrid), + _Aggregate(CoarseGrid,FineGrid,checkerboard), + _FineOp(FineOp), + _checkerboard(checkerboard) + { + evals_fine.resize(0); + evals_coarse.resize(0); + }; + void Orthogonalise(void ) { _Aggregate.Orthogonalise(); } + + template static RealD normalise(T& v) + { + RealD nn = norm2(v); + nn = ::sqrt(nn); + v = v * (1.0/nn); + return nn; + } + + void fakeFine(void) + { + int Nk = nbasis; + _Aggregate.subspace.resize(Nk,_FineGrid); + _Aggregate.subspace[0]=1.0; + _Aggregate.subspace[0].checkerboard=_checkerboard; + normalise(_Aggregate.subspace[0]); + PlainHermOp Op(_FineOp); + for(int k=1;k Op(_FineOp); + ImplicitlyRestartedLanczosHermOpTester SimpleTester(Op); + for(int k=0;k ChebySmooth(cheby_smooth); + ProjectedFunctionHermOp ChebyOp (ChebySmooth,_FineOp,_Aggregate); + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); + + for(int k=0;k Cheby(cheby_parms); + FunctionHermOp ChebyOp(Cheby,_FineOp); + PlainHermOp Op(_FineOp); + + evals_fine.resize(Nm); + _Aggregate.subspace.resize(Nm,_FineGrid); + + ImplicitlyRestartedLanczos IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); + + FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; + + int Nconv; + IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false); + + // Shrink down to number saved + assert(Nstop>=nbasis); + assert(Nconv>=nbasis); + evals_fine.resize(nbasis); + _Aggregate.subspace.resize(nbasis,_FineGrid); + } + void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, + int Nstop, int Nk, int Nm,RealD resid, + RealD MaxIt, RealD betastp, int MinRes) + { + Chebyshev Cheby(cheby_op); + ProjectedHermOp Op(_FineOp,_Aggregate); + ProjectedFunctionHermOp ChebyOp (Cheby,_FineOp,_Aggregate); + ////////////////////////////////////////////////////////////////////////////////////////////////// + // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL + ////////////////////////////////////////////////////////////////////////////////////////////////// + + Chebyshev ChebySmooth(cheby_smooth); + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); + + evals_coarse.resize(Nm); + evec_coarse.resize(Nm,_CoarseGrid); + + CoarseField src(_CoarseGrid); src=1.0; + + ImplicitlyRestartedLanczos IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); + int Nconv=0; + IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); + assert(Nconv>=Nstop); + evals_coarse.resize(Nstop); + evec_coarse.resize 
(Nstop,_CoarseGrid); + for (int i=0;i friend class Lattice; GridBase(const std::vector & processor_grid) : CartesianCommunicator(processor_grid) {}; GridBase(const std::vector & processor_grid, - const CartesianCommunicator &parent) : CartesianCommunicator(processor_grid,parent) {}; + const CartesianCommunicator &parent, + int &split_rank) + : CartesianCommunicator(processor_grid,parent,split_rank) {}; + GridBase(const std::vector & processor_grid, + const CartesianCommunicator &parent) + : CartesianCommunicator(processor_grid,parent,dummy) {}; + + virtual ~GridBase() = default; // Physics Grid information. std::vector _simd_layout;// Which dimensions get relayed out over simd lanes. diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index a6a85ab7..9273abf3 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -38,7 +38,7 @@ namespace Grid{ class GridCartesian: public GridBase { public: - + int dummy; virtual int CheckerBoardFromOindexTable (int Oindex) { return 0; } @@ -67,7 +67,14 @@ public: GridCartesian(const std::vector &dimensions, const std::vector &simd_layout, const std::vector &processor_grid, - const GridCartesian &parent) : GridBase(processor_grid,parent) + const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) + { + Init(dimensions,simd_layout,processor_grid); + } + GridCartesian(const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid, + const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) { Init(dimensions,simd_layout,processor_grid); } @@ -81,6 +88,8 @@ public: Init(dimensions,simd_layout,processor_grid); } + virtual ~GridCartesian() = default; + void Init(const std::vector &dimensions, const std::vector &simd_layout, const std::vector &processor_grid) diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index f89cacc5..ee424385 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -133,6 +133,8 @@ public: { Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ; } + + virtual ~GridRedBlackCartesian() = default; #if 0 //////////////////////////////////////////////////////////// // Create redblack grid ;; deprecate these. 
Should not @@ -205,6 +207,7 @@ public: { assert((_gdimensions[d] & 0x1) == 0); _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard + _gsites /= 2; } _ldimensions[d] = _gdimensions[d] / _processors[d]; assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index ce9a3cf0..3e561405 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -97,13 +97,55 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) } -#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) +#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3) +void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) +{ + std::vector row(_ndimension,1); + assert(dim>=0 && dim<_ndimension); -CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent) + // Split the communicator + row[dim] = _processors[dim]; + + int me; + CartesianCommunicator Comm(row,*this,me); + Comm.AllToAll(in,out,words,bytes); +} +void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) +{ + // MPI is a pain and uses "int" arguments + // 64*64*64*128*16 == 500Million elements of data. + // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. + // (Turns up on 32^3 x 64 Gparity too) + MPI_Datatype object; + int iwords; + int ibytes; + iwords = words; + ibytes = bytes; + assert(words == iwords); // safe to cast to int ? + assert(bytes == ibytes); // safe to cast to int ? + MPI_Type_contiguous(ibytes,MPI_BYTE,&object); + MPI_Type_commit(&object); + MPI_Alltoall(in,iwords,object,out,iwords,object,communicator); + MPI_Type_free(&object); +} +#endif + +#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) +CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) { _ndimension = processors.size(); - assert(_ndimension = parent._ndimension); - + + int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); + std::vector parent_processor_coor(_ndimension,0); + std::vector parent_processors (_ndimension,1); + + // Can make 5d grid from 4d etc... 
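  // Hedged illustration (hypothetical numbers): splitting a 4d parent with
  // _processors = {2,2,2,2} using a 5d 'processors' argument such as
  // {1,2,2,2,2} gives pad = 1; the extra dimension simply keeps the defaults
  // initialised above (extent 1, coordinate 0), so the modulo/divide split
  // further down reduces to the plain 4d case.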
+ int pad = _ndimension-parent_ndimension; + for(int d=0;d &processors, std::vector ssize(_ndimension); // coor of split within parent for(int d=0;d<_ndimension;d++){ - ccoor[d] = parent._processor_coor[d] % processors[d]; - scoor[d] = parent._processor_coor[d] / processors[d]; - ssize[d] = parent._processors[d]/ processors[d]; + ccoor[d] = parent_processor_coor[d] % processors[d]; + scoor[d] = parent_processor_coor[d] / processors[d]; + ssize[d] = parent_processors[d] / processors[d]; } - int crank,srank; // rank within subcomm ; rank of subcomm within blocks of subcomms - Lexicographic::IndexFromCoor(ccoor,crank,processors); - Lexicographic::IndexFromCoor(scoor,srank,ssize); + int crank; // rank within subcomm ; srank is rank of subcomm within blocks of subcomms + // Mpi uses the reverse Lexico convention to us + Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); + Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); MPI_Comm comm_split; if ( Nchild > 1 ) { - // std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec< &processors, // Declare victory ////////////////////////////////////////////////////////////////////////////////////////////////////// // std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into " - // << Nchild <<" communicators with " << childsize << " ranks"< &processors, MPI_Comm communicator_base) { - // if ( communicator_base != communicator_world ) { - // std::cout << "Cartesian communicator created with a non-world communicator"< &proc } std::vector periodic(_ndimension,1); - MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],1,&communicator); + MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator); MPI_Comm_rank(communicator,&_processor); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); + if ( 0 && (communicator_base != communicator_world) ) { + std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"< &proc assert(Size==_Nprocessors); } +#endif +#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { InitFromMPICommunicator(processors,communicator_world); @@ -204,10 +283,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) #endif #if !defined( GRID_COMMS_MPI3) - int CartesianCommunicator::NodeCount(void) { return ProcessorCount();}; int CartesianCommunicator::RankCount(void) { return ProcessorCount();}; #endif + #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT) double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int xmit_to_rank, diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 8ff22dbd..73ea6165 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -153,11 +153,12 @@ class CartesianCommunicator { // Constructors to sub-divide a parent communicator // and default to comm world //////////////////////////////////////////////// - CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent); + CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank); CartesianCommunicator(const std::vector &pdimensions_in); + virtual ~CartesianCommunicator(); private: -#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) 
|| defined (GRID_COMMS_MPI3) //////////////////////////////////////////////// // Private initialise from an MPI communicator // Can use after an MPI_Comm_split, but hidden from user so private @@ -273,12 +274,16 @@ class CartesianCommunicator { // std::cerr << " AllToAll in.size() "< void Broadcast(int root,obj &data) { diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 678e4517..2075e4bf 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -52,6 +52,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); ShmInitGeneric(); } + +CartesianCommunicator::~CartesianCommunicator() +{ + int MPI_is_finalised; + MPI_Finalized(&MPI_is_finalised); + if (communicator && !MPI_is_finalised) + MPI_Comm_free(&communicator); +} + void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); @@ -187,21 +196,6 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) root, communicator); assert(ierr==0); -} -void CartesianCommunicator::AllToAll(int dim,void *in,void *out,int bytes) -{ - std::vector row(_ndimension,1); - assert(dim>=0 && dim<_ndimension); - - // Split the communicator - row[dim] = _processors[dim]; - - CartesianCommunicator Comm(row,*this); - Comm.AllToAll(in,out,bytes); -} -void CartesianCommunicator::AllToAll(void *in,void *out,int bytes) -{ - MPI_Alltoall(in ,bytes,MPI_BYTE,out,bytes,MPI_BYTE,communicator); } /////////////////////////////////////////////////////// // Should only be used prior to Grid Init finished. diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index dce9588a..e41749d4 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -454,11 +454,15 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &c ////////////////////////////////// // Try to subdivide communicator ////////////////////////////////// -CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent) +/* + * Use default in MPI compile + */ +CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) : CartesianCommunicator(processors) { std::cout << "Attempts to split MPI3 communicators will fail until implemented" < &processors) { int ierr; @@ -596,6 +600,17 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) } } }; +CartesianCommunicator::~CartesianCommunicator() +{ + int MPI_is_finalised; + MPI_Finalized(&MPI_is_finalised); + if (communicator && !MPI_is_finalised) { + MPI_Comm_free(&communicator); + for(int i=0;i &processors,const CartesianCommunicator &parent) - : CartesianCommunicator(processors) {} +CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) + : CartesianCommunicator(processors) { srank=0;} CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { @@ -56,6 +56,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) } } +CartesianCommunicator::~CartesianCommunicator(){} + void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSum(double &){} @@ -98,9 +100,13 @@ void 
CartesianCommunicator::SendToRecvFromComplete(std::vector & { assert(0); } -void CartesianCommunicator::AllToAll(int dim,void *in,void *out,int bytes) +void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) { - bcopy(in,out,bytes); + bcopy(in,out,bytes*words); +} +void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) +{ + bcopy(in,out,bytes*words); } int CartesianCommunicator::RankWorld(void){return 0;} diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index ed49285d..03e3173e 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -75,6 +75,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { ShmInitGeneric(); } +CartesianCommunicator::~CartesianCommunicator(){} + CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent) : CartesianCommunicator(processors) { diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 713a8788..c7e2a507 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -50,26 +50,22 @@ inline void subdivides(GridBase *coarse,GridBase *fine) //////////////////////////////////////////////////////////////////////////////////////////// template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ half.checkerboard = cb; - int ssh=0; - //parallel_for - for(int ss=0;ssoSites();ss++){ - std::vector coor; + + parallel_for(int ss=0;ssoSites();ss++){ int cbos; - + std::vector coor; full._grid->oCoorFromOindex(coor,ss); cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { + int ssh=half._grid->oIndex(coor); half._odata[ssh] = full._odata[ss]; - ssh++; } } } template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.checkerboard; - int ssh=0; - //parallel_for - for(int ss=0;ssoSites();ss++){ + parallel_for(int ss=0;ssoSites();ss++){ std::vector coor; int cbos; @@ -77,8 +73,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine) cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { + int ssh=half._grid->oIndex(coor); full._odata[ss]=half._odata[ssh]; - ssh++; } } } @@ -109,8 +105,8 @@ inline void blockProject(Lattice > &coarseData, coarseData=zero; - // Loop with a cache friendly loop ordering - for(int sf=0;sfoSites();sf++){ + // Loop over coars parallel, and then loop over fine associated with coarse. + parallel_for(int sf=0;sfoSites();sf++){ int sc; std::vector coor_c(_ndimension); @@ -119,8 +115,9 @@ inline void blockProject(Lattice > &coarseData, for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); +PARALLEL_CRITICAL for(int i=0;i &fineZ, GridBase * coarse= coarseA._grid; fineZ.checkerboard=fineX.checkerboard; + assert(fineX.checkerboard==fineY.checkerboard); subdivides(coarse,fine); // require they map conformable(fineX,fineY); conformable(fineX,fineZ); @@ -180,9 +178,10 @@ template GridBase *coarse(CoarseInner._grid); GridBase *fine (fineX._grid); - Lattice fine_inner(fine); + Lattice fine_inner(fine); fine_inner.checkerboard = fineX.checkerboard; Lattice coarse_inner(coarse); + // Precision promotion? 
fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); parallel_for(int ss=0;ssoSites();ss++){ @@ -193,7 +192,7 @@ template inline void blockNormalise(Lattice &ip,Lattice &fineX) { GridBase *coarse = ip._grid; - Lattice zz(fineX._grid); zz=zero; + Lattice zz(fineX._grid); zz=zero; zz.checkerboard=fineX.checkerboard; blockInnerProduct(ip,fineX,fineX); ip = pow(ip,-0.5); blockZAXPY(fineX,ip,fineX,zz); @@ -216,19 +215,25 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } + // Turn this around to loop threaded over sc and interior loop + // over sf would thread better coarseData=zero; - for(int sf=0;sfoSites();sf++){ - + parallel_region { + int sc; std::vector coor_c(_ndimension); std::vector coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf]; + parallel_for_internal(int sf=0;sfoSites();sf++){ + + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + +PARALLEL_CRITICAL + coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf]; + } } return; } @@ -238,7 +243,7 @@ inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice zz(fine); + Lattice zz(fine); zz.checkerboard = unpicked.checkerboard; Lattice > fcoor(fine); zz = zero; @@ -303,20 +308,21 @@ inline void blockPromote(const Lattice > &coarseData, } // Loop with a cache friendly loop ordering - for(int sf=0;sfoSites();sf++){ - + parallel_region { int sc; std::vector coor_c(_ndimension); std::vector coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - for(int i=0;ioSites();sf++){ + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + + for(int i=0;i &out, const Lattice &in){ //////////////////////////////////////////////////////////////////////////////// // Communicate between grids //////////////////////////////////////////////////////////////////////////////// -// -// All to all plan -// -// Subvolume on fine grid is v. Vectors a,b,c,d -// -/////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SIMPLEST CASE: -/////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Mesh of nodes (2) ; subdivide to 1 subdivisions -// -// Lex ord: -// N0 va0 vb0 N1 va1 vb1 -// -// For each dimension do an all to all -// -// full AllToAll(0) -// N0 va0 va1 N1 vb0 vb1 -// -// REARRANGE -// N0 va01 N1 vb01 -// -// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract". -// NB: Easiest to programme if keep in lex order. 
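// A sketch of the alternative loop order suggested by the comment above ("loop threaded
// over sc and interior loop over sf would thread better"): threading over coarse sites
// gives each thread a private accumulator, so no PARALLEL_CRITICAL is needed. 1-d
// analogue with a block factor; assumes fine.size() == coarse.size()*block.
#include <vector>

void blockSum1d(std::vector<double> &coarse, const std::vector<double> &fine, long block) {
  #pragma omp parallel for
  for (long sc = 0; sc < (long)coarse.size(); sc++) {
    double acc = 0.0;                                        // private per coarse site
    for (long sf = sc * block; sf < (sc + 1) * block; sf++) acc += fine[sf];
    coarse[sc] = acc;                                        // written by exactly one thread
  }
}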
-// /////////////////////////////////////////////////////////////////////////////////////////////////////////// // SIMPLE CASE: /////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -745,8 +727,17 @@ void precisionChange(Lattice &out, const Lattice &in){ // // Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract". // NB: Easiest to programme if keep in lex order. -// -///////////////////////////////////////////////////////// +/* + * Let chunk = (fvol*nvec)/sP be size of a chunk. ( Divide lexico vol * nvec into fP/sP = M chunks ) + * + * 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M) + * + * node 0 1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1).. data chunk x M x sP = fL / sP * M * sP = fL * M growth + * node 1 1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1).. + * node 2 1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1).. + * node 3 1st chunk of node 3M..(3M-1); 2nd chunk of node 2M..(3M-1).. + * etc... + */ template void Grid_split(std::vector > & full,Lattice & split) { @@ -790,11 +781,12 @@ void Grid_split(std::vector > & full,Lattice & split) ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d]; } - int lsites = full_grid->lSites(); - Integer sz = lsites * nvector; + uint64_t lsites = full_grid->lSites(); + uint64_t sz = lsites * nvector; std::vector tmpdata(sz); std::vector alldata(sz); std::vector scalardata(lsites); + for(int v=0;v > & full,Lattice & split) int nvec = nvector; // Counts down to 1 as we collapse dims std::vector ldims = full_grid->_ldimensions; - std::vector lcoor(ndim); - for(int d=0;d=0;d--){ if ( ratio[d] != 1 ) { full_grid ->AllToAll(d,alldata,tmpdata); + if ( split_grid->_processors[d] > 1 ) { + alldata=tmpdata; + split_grid->AllToAll(d,alldata,tmpdata); + } - ////////////////////////////////////////// - //Local volume for this dimension is expanded by ratio of processor extents - // Number of vectors is decreased by same factor - // Rearrange to lexico for bigger volume - ////////////////////////////////////////// - nvec /= ratio[d]; - auto rdims = ldims; rdims[d] *= ratio[d]; - auto rsites= lsites*ratio[d]; - for(int v=0;v_processors[d]; + int fP = full_grid->_processors[d]; - Lexicographic::CoorFromIndex(lcoor, lsite, ldims); + int fvol = lsites; + + int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol); - for(int r=0;r coor(ndim); + Lexicographic::CoorFromIndex(coor, lex_fvol, ldims); + coor[d] += m*ldims[d]; + Lexicographic::IndexFromCoor(coor, lex_r, rdims); + lex_r += lex_vec * rsites; - int rsite; Lexicographic::IndexFromCoor(rcoor, rsite, rdims); - rsite += v * rsites; - - int rmul=nvec*lsites; - int vmul= lsites; - alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul]; + // LexicoFind coordinate & vector number within split lattice + alldata[lex_r] = tmpdata[lex_c]; } } @@ -844,13 +848,8 @@ void Grid_split(std::vector > & full,Lattice & split) ldims[d]*= ratio[d]; lsites *= ratio[d]; - if ( split_grid->_processors[d] > 1 ) { - tmpdata = alldata; - split_grid->AllToAll(d,tmpdata,alldata); - } } } - vectorizeFromLexOrdArray(alldata,split); } @@ -908,8 +907,8 @@ void Grid_unsplit(std::vector > & full,Lattice & split) ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d]; } - int lsites = full_grid->lSites(); - Integer sz = lsites * nvector; + uint64_t lsites = full_grid->lSites(); + uint64_t sz = lsites * nvector; std::vector tmpdata(sz); std::vector alldata(sz); std::vector 
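// Worked example of the chunk bookkeeping introduced above, using the definitions in the
// code (M = fP/sP, fvol = local sites, chunk = nvec*fvol/sP); the numbers are illustrative.
//   fP = 4 ranks of the full grid in dimension d, sP = 2 ranks of the split grid
//     => M = fP/sP = 2 chunks per rank
//   nvec = 8 right-hand sides, fvol = 16 local sites
//     => chunk = nvec*fvol/sP = 8*16/2 = 64 elements, and chunk*sP = nvec*fvol = 128 holds
// After the all-to-all over the sP ranks, the local extent in dimension d grows by the
// factor M while the number of vectors held per rank shrinks by the same M (8 -> 4 here),
// which is the rearrangement that the lexicographic remap below (lex_c -> lex_r via
// CoorFromIndex/IndexFromCoor) implements.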
scalardata(lsites); @@ -919,72 +918,74 @@ void Grid_unsplit(std::vector > & full,Lattice & split) ///////////////////////////////////////////////////////////////// // Start from split grid and work towards full grid ///////////////////////////////////////////////////////////////// - std::vector lcoor(ndim); - std::vector rcoor(ndim); int nvec = 1; - lsites = split_grid->lSites(); - std::vector ldims = split_grid->_ldimensions; + uint64_t rsites = split_grid->lSites(); + std::vector rdims = split_grid->_ldimensions; - for(int d=ndim-1;d>=0;d--){ + for(int d=0;d_processors[d] > 1 ) { - tmpdata = alldata; - split_grid->AllToAll(d,tmpdata,alldata); - } - - ////////////////////////////////////////// - //Local volume for this dimension is expanded by ratio of processor extents - // Number of vectors is decreased by same factor - // Rearrange to lexico for bigger volume - ////////////////////////////////////////// - auto rsites= lsites/ratio[d]; - auto rdims = ldims; rdims[d]/=ratio[d]; - - for(int v=0;v smaller local volume - // lsite, lcoor --> bigger original (single node?) volume - // For loop over each site within smaller subvol - for(int rsite=0;rsite_processors[d]; + int fP = full_grid->_processors[d]; + + auto ldims = rdims; ldims[d] /= M; // Decrease local dims by same factor + auto lsites= rsites/M; // Decreases rsites by M + + int fvol = lsites; + int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol); + + { + // Loop over reordered data post A2A + for(int c=0;c coor(ndim); + Lexicographic::CoorFromIndex(coor, lex_fvol, ldims); + coor[d] += m*ldims[d]; + Lexicographic::IndexFromCoor(coor, lex_r, rdims); + lex_r += lex_vec * rsites; + + // LexicoFind coordinate & vector number within split lattice + tmpdata[lex_c] = alldata[lex_r]; + } } } } - nvec *= ratio[d]; - ldims[d]=rdims[d]; - lsites =rsites; + if ( split_grid->_processors[d] > 1 ) { + split_grid->AllToAll(d,tmpdata,alldata); + tmpdata=alldata; + } full_grid ->AllToAll(d,tmpdata,alldata); + rdims[d]/= M; + rsites /= M; + nvec *= M; // Increase nvec by subdivision factor } } lsites = full_grid->lSites(); for(int v=0;v &logstreams) { GridLogError.Active(0); diff --git a/lib/log/Log.h b/lib/log/Log.h index 74d080bb..ddff4c1d 100644 --- a/lib/log/Log.h +++ b/lib/log/Log.h @@ -85,12 +85,15 @@ class Logger { protected: Colours &Painter; int active; + int timing_mode; static int timestamp; std::string name, topName; std::string COLOUR; public: - static GridStopWatch StopWatch; + static GridStopWatch GlobalStopWatch; + GridStopWatch LocalStopWatch; + GridStopWatch *StopWatch; static std::ostream devnull; std::string background() {return Painter.colour["NORMAL"];} @@ -101,22 +104,38 @@ public: name(nm), topName(topNm), Painter(col_class), - COLOUR(col) {} ; + timing_mode(0), + COLOUR(col) + { + StopWatch = & GlobalStopWatch; + }; void Active(int on) {active = on;}; int isActive(void) {return active;}; static void Timestamp(int on) {timestamp = on;}; - + void Reset(void) { + StopWatch->Reset(); + StopWatch->Start(); + } + void TimingMode(int on) { + timing_mode = on; + if(on) { + StopWatch = &LocalStopWatch; + Reset(); + } + } + friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { - stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : "; - stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : "; + stream << log.background()<< std::left << log.topName << log.background()<< " : "; + stream << log.colour() << std::left << 
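// Usage sketch of the per-logger timing mode added above: switching a logger to timing
// mode gives it its own stopwatch, reset after every message, so the time column reports
// the interval since the previous line rather than the time since Grid_init. Only the
// interfaces introduced in this patch are assumed.
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  GridLogIterative.TimingMode(1);        // switch to the local, auto-resetting stopwatch
  std::cout << GridLogIterative << "starting solve" << std::endl;
  // ... work whose per-step duration we want to see ...
  std::cout << GridLogIterative << "finished solve" << std::endl; // elapsed since previous line
  Grid_finalize();
  return 0;
}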
log.name << log.background() << " : "; if ( log.timestamp ) { - StopWatch.Stop(); - GridTime now = StopWatch.Elapsed(); - StopWatch.Start(); - stream << log.evidence()<< now << log.background() << " : " ; + log.StopWatch->Stop(); + GridTime now = log.StopWatch->Elapsed(); + if ( log.timing_mode==1 ) log.StopWatch->Reset(); + log.StopWatch->Start(); + stream << log.evidence()<< std::setw(6)< &logstreams); +extern GridLogger GridLogIRL; +extern GridLogger GridLogSolver; extern GridLogger GridLogError; extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index d14f3fe2..b40a75af 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -261,7 +261,7 @@ class BinaryIO { GridBase *grid, std::vector &iodata, std::string file, - int offset, + Integer offset, const std::string &format, int control, uint32_t &nersc_csum, uint32_t &scidac_csuma, @@ -356,7 +356,7 @@ class BinaryIO { if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { #ifdef USE_MPI_IO - std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl; + std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl; ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0); ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); @@ -367,7 +367,7 @@ class BinaryIO { assert(0); #endif } else { - std::cout << GridLogMessage << "C++ read I/O " << file << " : " + std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : " << iodata.size() * sizeof(fobj) << " bytes" << std::endl; std::ifstream fin; fin.open(file, std::ios::binary | std::ios::in); @@ -413,9 +413,9 @@ class BinaryIO { timer.Start(); if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { #ifdef USE_MPI_IO - std::cout << GridLogMessage << "MPI write I/O " << file << std::endl; + std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl; ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh); - std::cout << GridLogMessage << "Checking for errors" << std::endl; + // std::cout << GridLogMessage << "Checking for errors" << std::endl; if (ierr != MPI_SUCCESS) { char error_string[BUFSIZ]; @@ -444,48 +444,56 @@ class BinaryIO { assert(0); #endif } else { + + std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : " + << iodata.size() * sizeof(fobj) << " bytes" << std::endl; std::ofstream fout; - fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); - try { - fout.open(file,std::ios::binary|std::ios::out|std::ios::in); - } catch (const std::fstream::failure& exc) { - std::cout << GridLogError << "Error in opening the file " << file << " for output" < &Umu, std::string file, munger munge, - int offset, + Integer offset, const std::string &format, uint32_t &nersc_csum, uint32_t &scidac_csuma, @@ -552,7 +560,7 @@ class BinaryIO { static inline void writeLatticeObject(Lattice &Umu, std::string file, munger munge, - int offset, + Integer offset, const std::string &format, uint32_t &nersc_csum, uint32_t &scidac_csuma, @@ -589,7 +597,7 @@ class BinaryIO { static inline void readRNG(GridSerialRNG &serial, GridParallelRNG ¶llel, std::string file, - int offset, + Integer offset, uint32_t &nersc_csum, uint32_t &scidac_csuma, uint32_t &scidac_csumb) @@ -651,7 +659,7 @@ class BinaryIO 
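// Standalone sketch of the writer's open-or-create logic discussed above: binary lattice
// records are written into an existing file at a given offset, which needs
// std::ios::in|std::ios::out, and that mode fails when the file does not yet exist, so
// the file is created first. Not Grid's exact code; the file name is illustrative.
#include <fstream>
#include <iostream>
#include <string>

int main() {
  const std::string file = "scratch.bin";
  std::ofstream fout(file, std::ios::binary | std::ios::out | std::ios::in);  // update in place
  if (!fout.is_open()) {
    std::ofstream(file, std::ios::binary | std::ios::out).close();            // create the file
    fout.open(file, std::ios::binary | std::ios::out | std::ios::in);         // reopen for update
  }
  if (!fout.is_open()) { std::cerr << "cannot open " << file << std::endl; return 1; }
  fout.seekp(0);                                     // seek to the record's offset (0 here)
  const char payload[] = "DATA";
  fout.write(payload, sizeof(payload) - 1);
  return 0;
}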
{ static inline void writeRNG(GridSerialRNG &serial, GridParallelRNG ¶llel, std::string file, - int offset, + Integer offset, uint32_t &nersc_csum, uint32_t &scidac_csuma, uint32_t &scidac_csumb) diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index ba71153d..b86e250f 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -147,7 +147,7 @@ namespace QCD { _scidacRecord = sr; - std::cout << GridLogMessage << "Build SciDAC datatype " < munge; BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); @@ -237,7 +237,7 @@ class GridLimeReader : public BinaryIO { ///////////////////////////////////////////// // Verify checksums ///////////////////////////////////////////// - scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb); + assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1); return; } } @@ -253,16 +253,13 @@ class GridLimeReader : public BinaryIO { while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { // std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" < xmlc(nbytes+1,'\0'); limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR); - // std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <=0); err=limeWriterCloseRecord(LimeW); assert(err>=0); limeDestroyHeader(h); - // std::cout << " File offset is now"<_gsites; createLimeRecordHeader(record_name, 0, 0, PayloadSize); - // std::cout << "W sizeof(sobj)" <_gsites<(); BinarySimpleMunger munge; BinaryIO::writeLatticeObject(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); + // fseek(File,0,SEEK_END); offset = ftello(File);std::cout << " offset now "<=0); + //////////////////////////////////////// // Write checksum element, propagaing forward from the BinaryIO // Always pair a checksum with a binary object, and close message @@ -382,7 +380,7 @@ class GridLimeWriter : public BinaryIO { std::stringstream streamb; streamb << std::hex << scidac_csumb; checksum.suma= streama.str(); checksum.sumb= streamb.str(); - std::cout << GridLogMessage<<" writing scidac checksums "< xmlc(nbytes+1,'\0'); limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR); - std::cout << GridLogMessage<< "Non binary record :" < munge; BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); diff --git a/lib/qcd/hmc/integrators/Integrator_algorithm.h b/lib/qcd/hmc/integrators/Integrator_algorithm.h index ecc125ef..13a37aeb 100644 --- a/lib/qcd/hmc/integrators/Integrator_algorithm.h +++ b/lib/qcd/hmc/integrators/Integrator_algorithm.h @@ -231,7 +231,7 @@ class ForceGradient : public Integrator +#include using namespace Grid; using namespace std; diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index 774c947f..5df2c780 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -125,7 +125,11 @@ static inline void write(Writer &WR,const std::string &s, const cname &obj){ }\ template \ static inline void read(Reader &RD,const std::string &s, cname &obj){ \ - push(RD,s);\ + if (!push(RD,s))\ + {\ + std::cout << Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl;\ + return;\ + };\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ pop(RD);\ }\ diff --git a/lib/serialisation/XmlIO.cc b/lib/serialisation/XmlIO.cc index a132a2f0..8ac7422c 100644 --- a/lib/serialisation/XmlIO.cc +++ b/lib/serialisation/XmlIO.cc @@ -70,8 +70,8 
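// Hand-expanded sketch of the guarded read that the macro change above generates, written
// out for a hypothetical one-member struct. It assumes Grid's free push/read/pop
// serialisation helpers, in the style the macro itself uses; "MyParams" and the member
// name are illustrative. A missing node now produces a warning and leaves the object at
// its defaults instead of descending into a null node.
#include <Grid/Grid.h>

struct MyParams { double mass = 0.0; };

template <typename T>
void readMyParams(Grid::Reader<T> &RD, const std::string &s, MyParams &obj) {
  if (!push(RD, s)) {
    std::cout << Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl;
    return;                                 // keep the defaults
  }
  read(RD, "mass", obj.mass);               // assumed expansion of GRID_MACRO_READ_MEMBER
  pop(RD);
}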
@@ XmlReader::XmlReader(const char *xmlstring,string toplev) : fileName_("") pugi::xml_parse_result result; result = doc_.load_string(xmlstring); if ( !result ) { - cerr << "XML error description: " << result.description() << "\n"; - cerr << "XML error offset : " << result.offset << "\n"; + cerr << "XML error description (from char *): " << result.description() << "\nXML\n"<< xmlstring << "\n"; + cerr << "XML error offset (from char *) " << result.offset << "\nXML\n"<< xmlstring <<"\n"; abort(); } if ( toplev == std::string("") ) { @@ -87,8 +87,8 @@ XmlReader::XmlReader(const string &fileName,string toplev) : fileName_(fileName) pugi::xml_parse_result result; result = doc_.load_file(fileName_.c_str()); if ( !result ) { - cerr << "XML error description: " << result.description() << "\n"; - cerr << "XML error offset : " << result.offset << "\n"; + cerr << "XML error description: " << result.description() <<" "<< fileName_ <<"\n"; + cerr << "XML error offset : " << result.offset <<" "<< fileName_ <<"\n"; abort(); } if ( toplev == std::string("") ) { @@ -100,13 +100,16 @@ XmlReader::XmlReader(const string &fileName,string toplev) : fileName_(fileName) bool XmlReader::push(const string &s) { + if (node_.child(s.c_str())) + { + node_ = node_.child(s.c_str()); - if (node_.child(s.c_str()) == NULL ) + return true; + } + else + { return false; - - node_ = node_.child(s.c_str()); - return true; - + } } void XmlReader::pop(void) @@ -117,20 +120,30 @@ void XmlReader::pop(void) bool XmlReader::nextElement(const std::string &s) { if (node_.next_sibling(s.c_str())) - { - node_ = node_.next_sibling(s.c_str()); - - return true; - } + { + node_ = node_.next_sibling(s.c_str()); + + return true; + } else - { - return false; - } + { + return false; + } } template <> void XmlReader::readDefault(const string &s, string &output) { - output = node_.child(s.c_str()).first_child().value(); + if (node_.child(s.c_str())) + { + output = node_.child(s.c_str()).first_child().value(); + } + else + { + std::cout << GridLogWarning << "XML: cannot open node '" << s << "'"; + std::cout << std::endl; + + output = ""; + } } diff --git a/lib/serialisation/XmlIO.h b/lib/serialisation/XmlIO.h index fcdbf1e4..e37eb8d9 100644 --- a/lib/serialisation/XmlIO.h +++ b/lib/serialisation/XmlIO.h @@ -39,6 +39,7 @@ Author: paboyle #include #include +#include namespace Grid { @@ -119,7 +120,6 @@ namespace Grid std::string buf; readDefault(s, buf); - // std::cout << s << " " << buf << std::endl; fromString(output, buf); } @@ -132,7 +132,13 @@ namespace Grid std::string buf; unsigned int i = 0; - push(s); + if (!push(s)) + { + std::cout << GridLogWarning << "XML: cannot open node '" << s << "'"; + std::cout << std::endl; + + return; + } while (node_.child("elem")) { output.resize(i + 1); diff --git a/lib/threads/Threads.h b/lib/threads/Threads.h index d15f15ce..36daf2af 100644 --- a/lib/threads/Threads.h +++ b/lib/threads/Threads.h @@ -51,7 +51,9 @@ Author: paboyle #define PARALLEL_CRITICAL #endif +#define parallel_region PARALLEL_REGION #define parallel_for PARALLEL_FOR_LOOP for +#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for namespace Grid { diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 1266d34d..031f8f5a 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -208,7 +208,7 @@ static int Grid_is_initialised = 0; void Grid_init(int *argc,char ***argv) { - GridLogger::StopWatch.Start(); + GridLogger::GlobalStopWatch.Start(); std::string arg; diff --git 
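// The parallel_region / parallel_for_internal macros added above pair an OpenMP parallel
// team with an interior work-shared loop, so per-thread scratch (the coordinate vectors
// in blockSum/blockPromote) is allocated once per thread rather than once per site.
// Assuming the usual expansions (parallel, for, critical; the exact definitions live in
// Threads.h and depend on the build), the pattern corresponds to:
#include <vector>

void accumulateBlocks(std::vector<double> &coarse, const std::vector<double> &fine, long block) {
  // assumes coarse.size()*block == fine.size()
  #pragma omp parallel
  {
    std::vector<int> scratch(4);               // per-thread scratch, allocated once
    #pragma omp for
    for (long sf = 0; sf < (long)fine.size(); sf++) {
      (void)scratch;                           // stands in for the coordinate workspace
      long sc = sf / block;                    // destination coarse site
      #pragma omp critical
      coarse[sc] += fine[sf];                  // serialised accumulation
    }
  }
}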
a/lib/util/Lexicographic.h b/lib/util/Lexicographic.h index b922dba5..f5c55b74 100644 --- a/lib/util/Lexicographic.h +++ b/lib/util/Lexicographic.h @@ -26,6 +26,25 @@ namespace Grid{ } } + static inline void IndexFromCoorReversed (const std::vector& coor,int &index,const std::vector &dims){ + int nd=dims.size(); + int stride=1; + index=0; + for(int d=nd-1;d>=0;d--){ + index = index+stride*coor[d]; + stride=stride*dims[d]; + } + } + static inline void CoorFromIndexReversed (std::vector& coor,int index,const std::vector &dims){ + int nd= dims.size(); + coor.resize(nd); + for(int d=nd-1;d>=0;d--){ + coor[d] = index % dims[d]; + index = index / dims[d]; + } + } + + }; } diff --git a/tests/debug/Test_cheby.cc b/tests/debug/Test_cheby.cc index 40544c56..72d07885 100644 --- a/tests/debug/Test_cheby.cc +++ b/tests/debug/Test_cheby.cc @@ -37,8 +37,15 @@ RealD InverseApproximation(RealD x){ RealD SqrtApproximation(RealD x){ return std::sqrt(x); } +RealD Approximation32(RealD x){ + return std::pow(x,-1.0/32.0); +} +RealD Approximation2(RealD x){ + return std::pow(x,-1.0/2.0); +} + RealD StepFunction(RealD x){ - if ( x<0.1 ) return 1.0; + if ( x<10.0 ) return 1.0; else return 0.0; } @@ -56,7 +63,6 @@ int main (int argc, char ** argv) Chebyshev ChebyInv(lo,hi,2000,InverseApproximation); - { std::ofstream of("chebyinv"); ChebyInv.csv(of); @@ -78,7 +84,6 @@ int main (int argc, char ** argv) ChebyStep.JacksonSmooth(); - { std::ofstream of("chebystepjack"); ChebyStep.csv(of); @@ -100,5 +105,30 @@ int main (int argc, char ** argv) ChebyNE.csv(of); } + lo=0.0; + hi=4.0; + Chebyshev Cheby32(lo,hi,2000,Approximation32); + { + std::ofstream of("cheby32"); + Cheby32.csv(of); + } + Cheby32.JacksonSmooth(); + { + std::ofstream of("cheby32jack"); + Cheby32.csv(of); + } + + Chebyshev ChebySqrt(lo,hi,2000,Approximation2); + { + std::ofstream of("chebysqrt"); + ChebySqrt.csv(of); + } + ChebySqrt.JacksonSmooth(); + { + std::ofstream of("chebysqrtjack"); + ChebySqrt.csv(of); + } + + Grid_finalize(); } diff --git a/tests/hmc/Test_remez.cc b/tests/hmc/Test_remez.cc index bc851173..5f4b0a25 100644 --- a/tests/hmc/Test_remez.cc +++ b/tests/hmc/Test_remez.cc @@ -38,11 +38,11 @@ int main (int argc, char ** argv) std::cout< +class BasisFieldVector { + public: + int _Nm; + + typedef typename Field::scalar_type Coeff_t; + typedef typename Field::vector_type vCoeff_t; + typedef typename Field::vector_object vobj; + typedef typename vobj::scalar_object sobj; + + std::vector _v; // _Nfull vectors + + void report(int n,GridBase* value) { + + std::cout << GridLogMessage << "BasisFieldVector allocated:\n"; + std::cout << GridLogMessage << " Delta N = " << n << "\n"; + std::cout << GridLogMessage << " Size of full vectors (size) = " << + ((double)n*sizeof(vobj)*value->oSites() / 1024./1024./1024.) 
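// Worked example of the two orderings added above: IndexFromCoor makes dimension 0 run
// fastest, while IndexFromCoorReversed makes the last dimension run fastest. Standalone
// re-implementation for illustration only:
#include <iostream>
#include <vector>

int idxFwd(const std::vector<int> &coor, const std::vector<int> &dims) {
  int index = 0, stride = 1;
  for (size_t d = 0; d < dims.size(); d++) { index += stride * coor[d]; stride *= dims[d]; }
  return index;
}
int idxRev(const std::vector<int> &coor, const std::vector<int> &dims) {
  int index = 0, stride = 1;
  for (int d = (int)dims.size() - 1; d >= 0; d--) { index += stride * coor[d]; stride *= dims[d]; }
  return index;
}
int main() {
  std::vector<int> dims{2, 3}, coor{1, 0};
  std::cout << idxFwd(coor, dims) << " " << idxRev(coor, dims) << std::endl; // prints "1 3"
  return 0;
}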
<< " GB\n"; + std::cout << GridLogMessage << " Size = " << _v.size() << " Capacity = " << _v.capacity() << std::endl; + + value->Barrier(); + +#ifdef __linux + if (value->IsBoss()) { + system("cat /proc/meminfo"); + } +#endif + + value->Barrier(); + + } + + BasisFieldVector(int Nm,GridBase* value) : _Nm(Nm), _v(Nm,value) { + report(Nm,value); + } + + ~BasisFieldVector() { + } + + Field& operator[](int i) { + return _v[i]; + } + + void orthogonalize(Field& w, int k) { + basisOrthogonalize(_v,w,k); + } + + void rotate(Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) { + basisRotate(_v,Qt,j0,j1,k0,k1,Nm); + } + + size_t size() const { + return _Nm; + } + + void resize(int n) { + if (n > _Nm) + _v.reserve(n); + + _v.resize(n,_v[0]._grid); + + if (n < _Nm) + _v.shrink_to_fit(); + + report(n - _Nm,_v[0]._grid); + + _Nm = n; + } + + void sortInPlace(std::vector& sort_vals, bool reverse) { + basisSortInPlace(_v,sort_vals,reverse); + } + + void deflate(const std::vector& eval,const Field& src_orig,Field& result) { + basisDeflate(_v,eval,src_orig,result); + } + + }; +} diff --git a/tests/lanczos/Test_dwf_compressed_lanczos.cc b/tests/lanczos/Test_dwf_compressed_lanczos.cc index 7fe37387..45690f05 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos.cc @@ -21,7 +21,14 @@ (ortho krylov low poly); and then fix up lowest say 200 eigenvalues by 1 run with high-degree poly (600 could be enough) */ #include -#include +#include +///////////////////////////////////////////////////////////////////////////// +// The following are now decoupled from the Lanczos and deal with grids. +// Safe to replace functionality +///////////////////////////////////////////////////////////////////////////// +#include "BlockedGrid.h" +#include "FieldBasisVector.h" +#include "BlockProjector.h" #include "FieldVectorIO.h" #include "Params.h" @@ -93,19 +100,6 @@ void write_history(char* fn, std::vector& hist) { fclose(f); } -template -class FunctionHermOp : public LinearFunction { -public: - OperatorFunction & _poly; - LinearOperatorBase &_Linop; - - FunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop) : _poly(poly), _Linop(linop) { - } - - void operator()(const Field& in, Field& out) { - _poly(_Linop,in,out); - } -}; template class CheckpointedLinearFunction : public LinearFunction { @@ -261,19 +255,6 @@ public: } }; -template -class PlainHermOp : public LinearFunction { -public: - LinearOperatorBase &_Linop; - - PlainHermOp(LinearOperatorBase& linop) : _Linop(linop) { - } - - void operator()(const Field& in, Field& out) { - _Linop.HermOp(in,out); - } -}; - template using CoarseSiteFieldGeneral = iScalar< iVector >; template using CoarseSiteFieldD = CoarseSiteFieldGeneral< vComplexD, N >; template using CoarseSiteFieldF = CoarseSiteFieldGeneral< vComplexF, N >; @@ -319,7 +300,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo Op2 = &Op2plain; } ProjectedHermOp,LatticeFermion> Op2nopoly(pr,HermOp); - BlockImplicitlyRestartedLanczos > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,betastp2,MaxIt,MinRes2); + ImplicitlyRestartedLanczos > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,MaxIt,betastp2,MinRes2); src_coarse = 1.0; @@ -350,7 +331,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo ) { - IRL2.calc(eval2,coef,src_coarse,Nconv,true,SkipTest2); + IRL2.calc(eval2,coef._v,src_coarse,Nconv,true); coef.resize(Nstop2); eval2.resize(Nstop2); @@ -450,6 +431,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD 
beta,int Npo auto result = src_orig; // undeflated solve + std::cout << GridLogMessage << " Undeflated solve "<IsBoss()) @@ -457,6 +439,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo // CG.ResHistory.clear(); // deflated solve with all eigenvectors + std::cout << GridLogMessage << " Deflated solve with all evectors"<& pr,RealD alpha2,RealD beta,int Npo // CG.ResHistory.clear(); // deflated solve with non-blocked eigenvectors + std::cout << GridLogMessage << " Deflated solve with non-blocked evectors"<& pr,RealD alpha2,RealD beta,int Npo // CG.ResHistory.clear(); // deflated solve with all eigenvectors and original eigenvalues from proj + std::cout << GridLogMessage << " Deflated solve with all eigenvectors and original eigenvalues from proj"< IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,betastp1,MaxIt,MinRes1); + ImplicitlyRestartedLanczos IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,MaxIt,betastp1,MinRes1); int Nconv; char tag[1024]; @@ -650,7 +635,7 @@ int main (int argc, char ** argv) { if (simple_krylov_basis) { quick_krylov_basis(evec,src,Op1,Nstop1); } else { - IRL1.calc(eval1,evec,src,Nconv,false,1); + IRL1.calc(eval1,evec._v,src,Nconv,false); } evec.resize(Nstop1); // and throw away superfluous eval1.resize(Nstop1); diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc new file mode 100644 index 00000000..4c702a33 --- /dev/null +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc @@ -0,0 +1,254 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_compressed_lanczos_reorg.cc + + Copyright (C) 2017 + +Author: Leans heavily on Christoph Lehner's code +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +/* + * Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features + * in Grid that were intended to be used to support blocked Aggregates, from + */ +#include +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos +{ +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid, + LinearOperatorBase &FineOp, + int checkerboard) + // Base constructor + : LocalCoherenceLanczos(FineGrid,CoarseGrid,FineOp,checkerboard) + {}; + + void checkpointFine(std::string evecs_file,std::string evals_file) + { + assert(this->_Aggregate.subspace.size()==nbasis); + emptyUserRecord record; + Grid::QCD::ScidacWriter WR; + WR.open(evecs_file); + for(int k=0;k_Aggregate.subspace[k],record); + } + WR.close(); + + XmlWriter WRx(evals_file); + write(WRx,"evals",this->evals_fine); + } + + void checkpointFineRestore(std::string evecs_file,std::string evals_file) + { + this->evals_fine.resize(nbasis); + this->_Aggregate.subspace.resize(nbasis,this->_FineGrid); + + std::cout << GridLogIRL<< "checkpointFineRestore: Reading evals from "<evals_fine); + + assert(this->evals_fine.size()==nbasis); + + std::cout << GridLogIRL<< "checkpointFineRestore: Reading evecs from "<_Aggregate.subspace[k].checkerboard=this->_checkerboard; + RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record); + + } + RD.close(); + } + + void checkpointCoarse(std::string evecs_file,std::string evals_file) + { + int n = this->evec_coarse.size(); + emptyUserRecord record; + Grid::QCD::ScidacWriter WR; + WR.open(evecs_file); + for(int k=0;kevec_coarse[k],record); + } + WR.close(); + + XmlWriter WRx(evals_file); + write(WRx,"evals",this->evals_coarse); + } + + void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec) + { + std::cout << "resizing coarse vecs to " << nvec<< std::endl; + this->evals_coarse.resize(nvec); + this->evec_coarse.resize(nvec,this->_CoarseGrid); + std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evals from "<evals_coarse); + + assert(this->evals_coarse.size()==nvec); + emptyUserRecord record; + std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evecs from "<evec_coarse[k],record); + } + RD.close(); + } +}; + +int main (int argc, char ** argv) { + + Grid_init(&argc,&argv); + GridLogIRL.TimingMode(1); + + LocalCoherenceLanczosParams Params; + { + Params.omega.resize(10); + Params.blockSize.resize(5); + XmlWriter writer("Params_template.xml"); + write(writer,"Params",Params); + std::cout << GridLogMessage << " Written Params_template.xml" < blockSize = Params.blockSize; + + // Grids + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector fineLatt = GridDefaultLatt(); + int dims=fineLatt.size(); + assert(blockSize.size()==dims+1); + 
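// Usage sketch of the parameter-template pattern above: a default-constructed
// serialisable object is written once to Params_template.xml so users can copy and edit
// it, and later runs read their settings back. "MyLanczosParams", its member list and the
// file names are illustrative; the reader call assumes the two-argument XmlReader
// constructor shown earlier in this patch, with "" selecting the document root.
#include <Grid/Grid.h>
using namespace Grid;

struct MyLanczosParams : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(MyLanczosParams, int, Ls, double, mass);
};

void writeTemplate() {
  MyLanczosParams p;
  p.Ls = 16; p.mass = 0.01;                 // illustrative defaults
  XmlWriter writer("Params_template.xml");
  write(writer, "Params", p);               // produces an editable template
}

void readParams(MyLanczosParams &p) {
  XmlReader reader("Params.xml", "");       // missing nodes now warn instead of aborting
  read(reader, "Params", p);
}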
std::vector coarseLatt(dims); + std::vector coarseLatt5d ; + + for (int d=0;d HermOp(Ddwf); + + // Eigenvector storage + LanczosParams fine =Params.FineParams; + LanczosParams coarse=Params.CoarseParams; + + const int Ns1 = fine.Nstop; const int Ns2 = coarse.Nstop; + const int Nk1 = fine.Nk; const int Nk2 = coarse.Nk; + const int Nm1 = fine.Nm; const int Nm2 = coarse.Nm; + + std::cout << GridLogMessage << "Keep " << fine.Nstop << " fine vectors" << std::endl; + std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl; + assert(Nm2 >= Nm1); + + const int nbasis= 60; + assert(nbasis==Ns1); + LocalCoherenceLanczosScidac _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd); + std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl; + + assert( (Params.doFine)||(Params.doFineRead)); + + if ( Params.doFine ) { + std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "< Coeffs { 0.,-1.}; Polynomial PolyX(Coeffs); - Chebyshev Cheb(0.2,5.,11); -// ChebyshevLanczos Cheb(9.,1.,0.,20); -// Cheb.csv(std::cout); -// exit(-24); - ImplicitlyRestartedLanczos IRL(HermOp,Cheb,Nstop,Nk,Nm,resid,MaxIt); + Chebyshev Cheby(0.2,5.,11); + + FunctionHermOp OpCheby(Cheby,HermOp); + PlainHermOp Op (HermOp); + + ImplicitlyRestartedLanczos IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt); std::vector eval(Nm); diff --git a/tests/lanczos/Test_synthetic_lanczos.cc b/tests/lanczos/Test_synthetic_lanczos.cc index 32fd6f32..4be9ca31 100644 --- a/tests/lanczos/Test_synthetic_lanczos.cc +++ b/tests/lanczos/Test_synthetic_lanczos.cc @@ -119,12 +119,13 @@ int main (int argc, char ** argv) RealD beta = 0.1; RealD mu = 0.0; int order = 11; - ChebyshevLanczos Cheby(alpha,beta,mu,order); + Chebyshev Cheby(alpha,beta,order); std::ofstream file("cheby.dat"); Cheby.csv(file); - HermOpOperatorFunction X; DumbOperator HermOp(grid); + FunctionHermOp OpCheby(Cheby,HermOp); + PlainHermOp Op(HermOp); const int Nk = 40; const int Nm = 80; @@ -133,8 +134,9 @@ int main (int argc, char ** argv) int Nconv; RealD eresid = 1.0e-6; - ImplicitlyRestartedLanczos IRL(HermOp,X,Nk,Nk,Nm,eresid,Nit); - ImplicitlyRestartedLanczos ChebyIRL(HermOp,Cheby,Nk,Nk,Nm,eresid,Nit); + + ImplicitlyRestartedLanczos IRL(Op,Op,Nk,Nk,Nm,eresid,Nit); + ImplicitlyRestartedLanczos ChebyIRL(OpCheby,Op,Nk,Nk,Nm,eresid,Nit); LatticeComplex src(grid); gaussian(RNG,src); { diff --git a/tests/lanczos/Test_wilson_lanczos.cc b/tests/lanczos/Test_wilson_lanczos.cc index e8549234..eabc86d7 100644 --- a/tests/lanczos/Test_wilson_lanczos.cc +++ b/tests/lanczos/Test_wilson_lanczos.cc @@ -86,9 +86,12 @@ int main(int argc, char** argv) { std::vector Coeffs{0, 1.}; Polynomial PolyX(Coeffs); - Chebyshev Cheb(0.0, 10., 12); - ImplicitlyRestartedLanczos IRL(HermOp, PolyX, Nstop, Nk, Nm, - resid, MaxIt); + Chebyshev Cheby(0.0, 10., 12); + + FunctionHermOp OpCheby(Cheby,HermOp); + PlainHermOp Op (HermOp); + + ImplicitlyRestartedLanczos IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt); std::vector eval(Nm); FermionField src(FGrid); diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index c553ba0a..b3373238 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -555,13 +555,13 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); - Subspace Aggregates(Coarse5d,FGrid); + Subspace Aggregates(Coarse5d,FGrid,0); // Aggregates.CreateSubspace(RNG5,HermDefOp,nbasis); assert ( (nbasis & 0x1)==0); int nb=nbasis/2; std::cout<RankCount() ; - 
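// The tests below all migrate to the same pattern, shown here as a condensed fragment
// (template parameters and surrounding setup assumed as in Test_dwf_lanczos): the
// implicitly restarted Lanczos now takes two LinearFunctions, a polynomial-accelerated
// operator for the iteration and a plain operator for convergence testing, instead of an
// operator plus an OperatorFunction; the longer constructors also reorder the tail
// arguments to (resid, MaxIt, betastp, MinRes).
Chebyshev<FermionField>      Cheby(0.2, 5., 11);           // acceleration polynomial
FunctionHermOp<FermionField> OpCheby(Cheby, HermOp);       // poly(HermOp) drives the iteration
PlainHermOp<FermionField>    Op(HermOp);                   // bare HermOp for residual checks
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);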
///////////////////////////////////////////// // Split into 1^4 mpi communicators ///////////////////////////////////////////// + for(int i=0;i> mpi_split[k]; + } + break; + } + } + + int nrhs = 1; + int me; + for(int i=0;i seeds({1,2,3,4}); - GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds); GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds); std::vector src(nrhs,FGrid); @@ -93,7 +105,7 @@ int main (int argc, char ** argv) emptyUserRecord record; std::string file("./scratch.scidac"); std::string filef("./scratch.scidac.ferm"); - int me = UGrid->ThisRank(); + LatticeGaugeField s_Umu(SGrid); FermionField s_src(SFGrid); FermionField s_src_split(SFGrid); @@ -169,7 +181,7 @@ int main (int argc, char ** argv) for(int n=0;nBarrier(); if ( n==me ) { - std::cerr << GridLogMessage<<"Split "<< me << " " << norm2(s_src_split) << " " << norm2(s_src)<< " diff " << norm2(s_tmp)<Barrier(); } @@ -190,7 +202,7 @@ int main (int argc, char ** argv) MdagMLinearOperator HermOp(Ddwf); MdagMLinearOperator HermOpCk(Dchk); - ConjugateGradient CG((1.0e-8/(me+1)),10000); + ConjugateGradient CG((1.0e-5/(me+1)),10000); s_res = zero; CG(HermOp,s_src,s_res); @@ -218,7 +230,6 @@ int main (int argc, char ** argv) std::cout << " diff " < mpi_layout = GridDefaultMpi(); std::vector mpi_split (mpi_layout.size(),1); - GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - int nrhs = UGrid->RankCount() ; - ///////////////////////////////////////////// // Split into 1^4 mpi communicators ///////////////////////////////////////////// + + for(int i=0;i> mpi_split[k]; + } + break; + } + } + + int nrhs = 1; + int me; + for(int i=0;i result(nrhs,FGrid); FermionField tmp(FGrid); - for(int s=0;sThisRank(); - LatticeGaugeField s_Umu(SGrid); FermionField s_src(SFGrid); FermionField s_tmp(SFGrid); @@ -98,6 +144,37 @@ int main (int argc, char ** argv) /////////////////////////////////////////////////////////////// Grid_split (Umu,s_Umu); Grid_split (src,s_src); + std::cout << GridLogMessage << " split rank " < HermOp(Ddwf); MdagMLinearOperator HermOpCk(Dchk); - ConjugateGradient CG((1.0e-8/(me+1)),10000); + ConjugateGradient CG((1.0e-2),10000); s_res = zero; CG(HermOp,s_src,s_res); + std::cout << GridLogMessage << " split residual norm "< mpi_layout = GridDefaultMpi(); std::vector mpi_split (mpi_layout.size(),1); - GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); @@ -57,10 +59,11 @@ int main (int argc, char ** argv) ///////////////////////////////////////////// // Split into 1^4 mpi communicators ///////////////////////////////////////////// + int me; GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(), 
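// Condensed sketch of the command-line driven splitting used in these tests (fragment
// from inside main() after Grid_init). The "--split" argument format, one integer per
// dimension in argv[i+1+k], and the nrhs product over layout ratios are reconstructions
// based on the loop shapes above, not verbatim code: mpi_split is filled from the command
// line, nrhs counts the resulting sub-blocks, and the split-grid constructor reports this
// rank's sub-block through the new "me" reference argument.
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split(mpi_layout.size(), 1);
for (int i = 0; i < argc; i++) {
  if (std::string(argv[i]) == "--split") {
    for (int k = 0; k < (int)mpi_layout.size(); k++) {
      std::stringstream ss(argv[i + 1 + k]);        // assumed: one integer per dimension
      ss >> mpi_split[k];
    }
    break;
  }
}
int nrhs = 1;                                       // one right-hand side per sub-block
for (int i = 0; i < (int)mpi_layout.size(); i++) nrhs *= (mpi_layout[i] / mpi_split[i]);

int me;                                             // this rank's position among the sub-blocks
GridCartesian *SGrid = new GridCartesian(GridDefaultLatt(),
                                         GridDefaultSimd(Nd, vComplex::Nsimd()),
                                         mpi_split,
                                         *UGrid, me);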
GridDefaultSimd(Nd,vComplex::Nsimd()), mpi_split, - *UGrid); + *UGrid,me); GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid); GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid); @@ -89,8 +92,6 @@ int main (int argc, char ** argv) ///////////////// // MPI only sends ///////////////// - int me = UGrid->ThisRank(); - LatticeGaugeField s_Umu(SGrid); FermionField s_src(SFGrid); FermionField s_src_e(SFrbGrid); diff --git a/tests/solver/Test_split_grid.cc b/tests/solver/Test_split_grid.cc new file mode 100644 index 00000000..2b6a4bf7 --- /dev/null +++ b/tests/solver/Test_split_grid.cc @@ -0,0 +1,157 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_mrhs_cg.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + typedef typename DomainWallFermionR::FermionField FermionField; + typedef typename DomainWallFermionR::ComplexField ComplexField; + typename DomainWallFermionR::ImplParams params; + + const int Ls=4; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + std::vector mpi_split (mpi_layout.size(),1); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + ///////////////////////////////////////////// + // Split into 1^4 mpi communicators + ///////////////////////////////////////////// + + for(int i=0;i> mpi_split[k]; + } + break; + } + } + + int nrhs = 1; + for(int i=0;i seeds({1,2,3,4}); + + GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds); + GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds); + std::vector src(nrhs,FGrid); + std::vector src_chk(nrhs,FGrid); + std::vector result(nrhs,FGrid); + FermionField tmp(FGrid); + + for(int s=0;sThisRank(); + + LatticeGaugeField s_Umu(SGrid); + FermionField s_src(SFGrid); + FermionField s_tmp(SFGrid); + FermionField s_res(SFGrid); + + /////////////////////////////////////////////////////////////// + // split the source out using MPI instead of I/O + 
/////////////////////////////////////////////////////////////// + Grid_split (Umu,s_Umu); + Grid_split (src,s_src); + + /////////////////////////////////////////////////////////////// + // Set up N-solvers as trivially parallel + /////////////////////////////////////////////////////////////// + RealD mass=0.01; + RealD M5=1.8; + DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5); + DomainWallFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5); + + std::cout << GridLogMessage << "****************************************************************** "< HermOp(Ddwf); + MdagMLinearOperator HermOpCk(Dchk); + ConjugateGradient CG((1.0e-8/(me+1)),10000); + s_res = zero; + CG(HermOp,s_src,s_res); + + ///////////////////////////////////////////////////////////// + // Report how long they all took + ///////////////////////////////////////////////////////////// + std::vector iterations(nrhs,0); + iterations[me] = CG.IterationsToComplete; + + for(int n=0;nGlobalSum(iterations[n]); + std::cout << GridLogMessage<<" Rank "< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typename ImprovedStaggeredFermionR::ImplParams params; + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + FermionField src(&Grid); random(pRNG,src); + FermionField result(&Grid); result=zero; + FermionField resid(&Grid); + + RealD mass=0.1; + ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); + + ConjugateGradient CG(1.0e-8,10000); + SchurRedBlackStaggeredSolve SchurSolver(CG); + + double volume=1.0; + for(int mu=0;mu volume * 1146 + double ncall=CG.IterationsToComplete; + double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 + + std::cout<
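// Worked check of the flop count used in the staggered CG benchmark above,
// flops = (16*(3*(6+8+8)) + 15*3*2) * volume * iterations; the reading of the individual
// factors is one plausible interpretation, the arithmetic itself is exact:
//   3*(6+8+8) = 66    flops per SU(3) matrix-vector multiply (per colour row: one complex
//                     multiply at 6 flops plus two complex multiply-adds at 8 flops each)
//   16*66     = 1056  one matrix-vector multiply per link term; the improved staggered
//                     operator has 8 one-hop plus 8 three-hop terms per site
//   15*3*2    = 90    combining the 16 resulting colour vectors: 15 additions of 3
//                     complex components
//   total     = 1146  flops per site, multiplied in the benchmark by the lattice volume
//                     and by CG.IterationsToComplete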