/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h Copyright (C) 2015 Author: Peter Boyle Author: paboyle Author: Chulwoo Jung This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #ifndef GRID_IRL_H #define GRID_IRL_H #include //memset #ifdef USE_LAPACK #ifdef USE_MKL #include #else void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, double *vl, double *vu, int *il, int *iu, double *abstol, int *m, double *w, double *z, int *ldz, int *isuppz, double *work, int *lwork, int *iwork, int *liwork, int *info); //#include #endif #endif //#include //#include // eliminate temorary vector in calc() #define MEM_SAVE namespace Grid { ///////////////////////////////////////////////////////////// // Implicitly restarted lanczos ///////////////////////////////////////////////////////////// // creating a seaprate instance to avoid conflicts for the time being template class ImplicitlyRestartedLanczosCJ { const RealD small = 1.0e-16; public: int lock; int get; int Niter; int converged; int Nstop; // Number of evecs checked for convergence int Nk; // Number of converged sought int Np; // Np -- Number of spare vecs in kryloc space int Nm; // Nm -- total number of vectors RealD OrthoTime; RealD eresid; SortEigen _sort; LinearOperatorBase &_Linop; OperatorFunction &_poly; ///////////////////////// // Constructor ///////////////////////// void init(void){}; void Abort(int ff, DenseVector &evals, DenseVector > &evecs); ImplicitlyRestartedLanczos( LinearOperatorBase &Linop, // op OperatorFunction & poly, // polynmial int _Nstop, // sought vecs int _Nk, // sought vecs int _Nm, // spare vecs RealD _eresid, // resid in lmdue deficit int _Niter) : // Max iterations _Linop(Linop), _poly(poly), Nstop(_Nstop), Nk(_Nk), Nm(_Nm), eresid(_eresid), Niter(_Niter) { Np = Nm-Nk; assert(Np>0); }; ImplicitlyRestartedLanczos( LinearOperatorBase &Linop, // op OperatorFunction & poly, // polynmial int _Nk, // sought vecs int _Nm, // spare vecs RealD _eresid, // resid in lmdue deficit int _Niter) : // Max iterations _Linop(Linop), _poly(poly), Nstop(_Nk), Nk(_Nk), Nm(_Nm), eresid(_eresid), Niter(_Niter) { Np = Nm-Nk; assert(Np>0); }; ///////////////////////// // Sanity checked this routine (step) against Saad. ///////////////////////// void RitzMatrix(DenseVector& evec,int k){ if(1) return; GridBase *grid = evec[0]._grid; Field w(grid); std::cout<1 ) { if (abs(in) >1.0e-9 ) { std::cout<& lmd, DenseVector& lme, DenseVector& evec, Field& w,int Nm,int k, RealD &norm) { assert( k< Nm ); _poly(_Linop,evec[k],w); // 3. wk:=Avk−βkv_{k−1} if(k>0){ w -= lme[k-1] * evec[k-1]; } ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk) RealD alph = real(zalph); w = w - alph * evec[k];// 5. wk:=wk−αkvk RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop // 7. vk+1 := wk/βk+1 norm=beta; std::cout<0) { orthogonalize(w,evec,k); // orthonormalise } if(k < Nm-1) evec[k+1] = w; } void qr_decomp(DenseVector& lmd, DenseVector& lme, int Nk, int Nm, DenseVector& Qt, RealD Dsh, int kmin, int kmax) { int k = kmin-1; RealD x; RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]); RealD c = ( lmd[k] -Dsh) *Fden; RealD s = -lme[k] *Fden; RealD tmpa1 = lmd[k]; RealD tmpa2 = lmd[k+1]; RealD tmpb = lme[k]; lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb; lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb; lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb; x =-s*lme[k+1]; lme[k+1] = c*lme[k+1]; for(int i=0; i& lmd, DenseVector& lme, int N1, int N2, DenseVector& Qt, GridBase *grid){ const int size = Nm; // tevals.resize(size); // tevecs.resize(size); LAPACK_INT NN = N1; // double evals_tmp[NN]; // double evec_tmp[NN][NN]; std::vector evals_tmp(NN); std::vector evec_tmp(NN*NN); memset(evec_tmp.data(),0,sizeof(double)*NN*NN); std::vector DD(NN); std::vector EE(NN); for (int i = 0; i< NN; i++) for (int j = i - 1; j <= i + 1; j++) if ( j < NN && j >= 0 ) { if (i==j) DD[i] = lmd[i]; if (i==j) evals_tmp[i] = lmd[i]; if (j==(i-1)) EE[j] = lme[j]; } LAPACK_INT evals_found; // LAPACK_INT lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; LAPACK_INT lwork = 1+(18*NN) ; LAPACK_INT liwork = 3+NN*10 ; // LAPACK_INT iwork[liwork]; // double work[lwork]; // LAPACK_INT isuppz[2*NN]; std::vector iwork(liwork); std::vector work(lwork); std::vector isuppz(2*NN); char jobz = 'V'; // calculate evals & evecs char range = 'I'; // calculate all evals // char range = 'A'; // calculate all evals char uplo = 'U'; // refer to upper half of original matrix char compz = 'I'; // Compute eigenvectors of tridiagonal matrix int ifail[NN]; LAPACK_INT info; // int total = QMP_get_number_of_nodes(); // int node = QMP_get_node_number(); // GridBase *grid = evec[0]._grid; int total = grid->_Nprocessors; int node = grid->_processor; int interval = (NN/total)+1; double vl = 0.0, vu = 0.0; LAPACK_INT il = interval*node+1 , iu = interval*(node+1); if (iu > NN) iu=NN; double tol = 0.0; if (1) { memset(evals_tmp.data(),0,sizeof(double)*NN); if ( il <= NN){ printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu); #ifdef USE_MKL dstegr(&jobz, &range, &NN, #else LAPACK_dstegr(&jobz, &range, &NN, #endif DD.data(), EE.data(), &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' &tol, // tolerance &evals_found, evals_tmp.data(), evec_tmp.data(), &NN, isuppz.data(), work.data(), &lwork, iwork.data(), &liwork, &info); for (int i = iu-1; i>= il-1; i--){ printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]); evals_tmp[i] = evals_tmp[i - (il-1)]; if (il>1) evals_tmp[i-(il-1)]=0.; for (int j = 0; j< NN; j++){ evec_tmp[i*NN+j] = evec_tmp[(i - (il-1))*NN+j]; if (il>1) evec_tmp[(i-(il-1))*NN+j]=0.; } } } { grid->GlobalSumVector(evals_tmp.data(),NN); grid->GlobalSumVector(evec_tmp.data(),NN*NN); } } // cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order. for(int i=0;i& lmd, DenseVector& lme, int N2, int N1, DenseVector& Qt, GridBase *grid) { #ifdef USE_LAPACK const int check_lapack=0; // just use lapack if 0, check against lapack if 1 if(!check_lapack) return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid); DenseVector lmd2(N1); DenseVector lme2(N1); DenseVector Qt2(N1*N1); for(int k=0; k= kmin; --j){ RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); if(fabs(lme[j-1])+dds > dds){ kmax = j+1; goto continued; } } Niter = iter; #ifdef USE_LAPACK if(check_lapack){ const double SMALL=1e-8; diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid); DenseVector lmd3(N2); for(int k=0; kSMALL) std::cout<SMALL) std::cout<SMALL) std::cout< dds){ kmin = j+1; break; } } } std::cout<& evec, int k) { double t0=-usecond()/1e6; typedef typename Field::scalar_type MyComplex; MyComplex ip; if ( 0 ) { for(int j=0; j &Qt) { for(int i=0; i& Qt, DenseVector& evec, int j0, int j1, int _Nk ) { GridBase *grid = evec[0]._grid; DenseVector B(Nm,grid); // waste of space replicating if (0) { // old implementation without blocking for(int i=0; i<(Nm); ++i) B[i] = 0.0; for(int j=j0; joSites();ss++){ for(int jj=j0; jj& Qt, DenseVector& evec, //int k1, int k2 int j0, int j1, int _Nk ) { GridBase *grid = evec[0]._grid; typedef typename Field::vector_object vobj; assert(j0>=0); assert(j1 B(Nm); #pragma omp for for(int ss=0;ss < grid->oSites();ss++){ for(int j=j0; j& Qt, DenseVector& evec, int k1, int k2 ) { GridBase *grid = evec[0]._grid; int j_block = 24; int k_block=24; assert(k20); int thr=GridThread::GetThreads(); int n_field = 1; int each = 1; if( (Nm*thr)>(grid->oSites()) ) { each = (grid->oSites())/Nm ; n_field = thr/each + 1; } std::cout<oSites()); for(int j=0; j& Qt, DenseVector& evec, DenseVector &eval2, DenseVector &Iconv, int &Nconv) { GridBase *grid = evec[0]._grid; DenseVector B(Nm,grid); // waste of space replicating Field v(grid); if (0) { for(int k = 0; k<_Nk; ++k) B[k]=0.0; for(int j = 0; j<_Nk; ++j){ for(int k = 0; k<_Nk; ++k){ B[j].checkerboard = evec[k].checkerboard; B[j] += Qt[k+j*Nm] * evec[k]; } std::cout<& eval, DenseVector& evec ) { GridBase *grid = evec[0]._grid; Field v(grid); Field B(grid); for(int j = 0; j<_Nk; ++j){ std::cout<& Qt, DenseVector& evec, DenseVector &eval2, DenseVector &Iconv, int &Nconv) { GridBase *grid = evec[0]._grid; Field v(grid); Field B(grid); Nconv = 0; for(int j = 0; j<_Nk; ++j){ B=0.; B.checkerboard = evec[0].checkerboard; for(int k = 0; k<_Nk; ++k){ B += Qt[k+j*Nm] * evec[k]; } std::cout<& Qt, DenseVector& evec, DenseVector &eval, DenseVector &eval2, DenseVector &Iconv, int &Nconv) { GridBase *grid = evec[0]._grid; int thr=GridThread::GetThreads(); for(int i=0; ioSites();ss++){ int me = GridThread::ThreadBarrier(); printf("thr=%d ss=%d me=%d\n",thr,ss,me);fflush(stdout); assert( (Nm*thr)oSites()); // auto B2 = evec[0]._odata[0]; // std::vector < decltype( B2 ) > B(Nm,B2); for(int j=0; j& Qt, DenseVector& evec, DenseVector &eval, DenseVector &eval2, DenseVector &Iconv, int &Nconv) { GridBase *grid = evec[0]._grid; typedef typename Field::vector_object vobj; #pragma omp parallel { std::vector < vobj > B(Nm); #pragma omp for for(int ss=0;ss < grid->oSites();ss++){ // for(int j=0; j K P = M − K † Compute the factorization AVM = VM HM + fM eM repeat Q=I for i = 1,...,P do QiRi =HM −θiI Q = QQi H M = Q †i H M Q i end for βK =HM(K+1,K) σK =Q(M,K) r=vK+1βK +rσK VK =VM(1:M)Q(1:M,1:K) HK =HM(1:K,1:K) →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM until convergence */ // alternate implementation for minimizing memory usage. May affect the performance void calc( DenseVector& eval, DenseVector& evec, const Field& src, int& Nconv) { GridBase *grid = evec[0]._grid; assert(grid == src._grid); std::cout<=Nstop ){ goto converged; } } // end of iter loop std::cout< evals, DenseVector evecs){ int N= evals.size(); _sort.push(evals,evecs, evals.size(),N); } /** There is some matrix Q such that for any vector y Q.e_1 = y and Q is unitary. **/ template static T orthQ(DenseMatrix &Q, DenseVector y){ int N = y.size(); //Matrix Size Fill(Q,0.0); T tau; for(int i=0;i 0.0){ T gam = conj( (y[j]/tau)/tau0 ); for(int k=0;k<=j-1;k++){ Q[k][j]=-gam*y[k]; } Q[j][j]=tau0/tau; } else { Q[j-1][j]=1.0; } tau0 = tau; } return tau; } /** There is some matrix Q such that for any vector y Q.e_k = y and Q is unitary. **/ template< class T> static T orthU(DenseMatrix &Q, DenseVector y){ T tau = orthQ(Q,y); SL(Q); return tau; } /** Wind up with a matrix with the first con rows untouched say con = 2 Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum and the matrix is upper hessenberg and with f and Q appropriately modidied with Q is the arnoldi factorization **/ template static void Lock(DenseMatrix &H, ///Hess mtx DenseMatrix &Q, ///Lock Transform T val, ///value to be locked int con, ///number already locked RealD small, int dfg, bool herm) { //ForceTridiagonal(H); int M = H.dim; DenseVector vec; Resize(vec,M-con); DenseMatrix AH; Resize(AH,M-con,M-con); AH = GetSubMtx(H,con, M, con, M); DenseMatrix QQ; Resize(QQ,M-con,M-con); Unity(Q); Unity(QQ); DenseVector evals; Resize(evals,M-con); DenseMatrix evecs; Resize(evecs,M-con,M-con); Wilkinson(AH, evals, evecs, small); int k=0; RealD cold = abs( val - evals[k]); for(int i=1;icon+2; j--){ DenseMatrix U; Resize(U,j-1-con,j-1-con); DenseVector z; Resize(z,j-1-con); T nm = norm(z); for(int k = con+0;k Hb; Resize(Hb,j-1-con,M); for(int a = 0;a Qb; Resize(Qb,M,M); for(int a = 0;a Hc; Resize(Hc,M,M); for(int a = 0;a