/* Authors: Christoph Lehner Date: 2017 Multigrid Lanczos TODO: High priority: - Explore filtering of starting vector again, should really work: If cheby has 4 for low mode region and 1 for high mode, applying 15 iterations has 1e9 suppression of high modes, which should create the desired invariant subspace already? Missing something here??? Maybe dynamic range dangerous, i.e., could also kill interesting eigenrange if not careful. Better: Use all Cheby up to order N in order to approximate a step function; try this! Problem: width of step function. Can kill eigenspace > 1e-3 and have < 1e-5 equal to 1 Low priority: - Given that I seem to need many restarts and high degree poly to create the base and this takes about 1 day, seriously consider a simple method to create a basis (ortho krylov low poly); and then fix up lowest say 200 eigenvalues by 1 run with high-degree poly (600 could be enough) */ #include #include ///////////////////////////////////////////////////////////////////////////// // The following are now decoupled from the Lanczos and deal with grids. // Safe to replace functionality ///////////////////////////////////////////////////////////////////////////// #include "BlockedGrid.h" #include "FieldBasisVector.h" #include "BlockProjector.h" #include "FieldVectorIO.h" #include "Params.h" using namespace std; using namespace Grid; ; bool read_evals(GridBase* _grid, char* fn, std::vector& evals) { FILE* f = 0; uint32_t status = 0; if (_grid->IsBoss()) { f = fopen(fn,"rt"); status = f ? 1 : 0; } _grid->GlobalSum(status); if (!status) return false; uint32_t N; if (f) assert(fscanf(f,"%d\n",&N)==1); else N = 0; _grid->GlobalSum(N); std::cout << "Reading " << N << " eigenvalues" << std::endl; evals.resize(N); for (int i=0;iGlobalSumVector(&evals[0],evals.size()); if (f) fclose(f); return true; } void write_evals(char* fn, std::vector& evals) { FILE* f = fopen(fn,"wt"); assert(f); int N = (int)evals.size(); fprintf(f,"%d\n",N); for (int i=0;i& hist) { FILE* f = fopen(fn,"wt"); assert(f); int N = (int)hist.size(); for (int i=0;i class CheckpointedLinearFunction : public LinearFunction { public: LinearFunction& _op; std::string _dir; int _max_apply; int _apply, _apply_actual; GridBase* _grid; FILE* _f; CheckpointedLinearFunction(GridBase* grid, LinearFunction& op, const char* dir,int max_apply) : _op(op), _dir(dir), _grid(grid), _f(0), _max_apply(max_apply), _apply(0), _apply_actual(0) { FieldVectorIO::conditionalMkDir(dir); char fn[4096]; sprintf(fn,"%s/ckpt_op.%4.4d",_dir.c_str(),_grid->ThisRank()); printf("CheckpointLinearFunction:: file %s\n",fn); _f = fopen(fn,"r+b"); if (!_f) _f = fopen(fn,"w+b"); assert(_f); fseek(_f,0,SEEK_CUR); } ~CheckpointedLinearFunction() { if (_f) { fclose(_f); _f = 0; } } bool load_ckpt(const Field& in, Field& out) { off_t cur = ftello(_f); fseeko(_f,0,SEEK_END); if (cur == ftello(_f)) return false; fseeko(_f,cur,SEEK_SET); size_t sz = sizeof(out._odata[0]) * out._odata.size(); GridStopWatch gsw; gsw.Start(); uint32_t crc_exp; assert(fread(&crc_exp,4,1,_f)==1); assert(fread(&out._odata[0],sz,1,_f)==1); assert(FieldVectorIO::crc32_threaded((unsigned char*)&out._odata[0],sz,0x0)==crc_exp); gsw.Stop(); printf("CheckpointLinearFunction:: reading %lld\n",(long long)sz); std::cout << GridLogMessage << "Loading " << ((RealD)sz/1024./1024./1024.) << " GB in " << gsw.Elapsed() << std::endl; return true; } void save_ckpt(const Field& in, Field& out) { fseek(_f,0,SEEK_CUR); // switch to write size_t sz = sizeof(out._odata[0]) * out._odata.size(); GridStopWatch gsw; gsw.Start(); uint32_t crc = FieldVectorIO::crc32_threaded((unsigned char*)&out._odata[0],sz,0x0); assert(fwrite(&crc,4,1,_f)==1); assert(fwrite(&out._odata[0],sz,1,_f)==1); fflush(_f); // try this on the GPFS to suppress OPA usage for disk during dslash; this is not needed at Lustre/JLAB gsw.Stop(); printf("CheckpointLinearFunction:: writing %lld\n",(long long)sz); std::cout << GridLogMessage << "Saving " << ((RealD)sz/1024./1024./1024.) << " GB in " << gsw.Elapsed() << std::endl; } void operator()(const Field& in, Field& out) { _apply++; if (load_ckpt(in,out)) return; _op(in,out); save_ckpt(in,out); if (_apply_actual++ >= _max_apply) { std::cout << GridLogMessage << "Maximum application of operator reached, checkpoint and finish in future job" << std::endl; if (_f) { fclose(_f); _f=0; } in._grid->Barrier(); Grid_finalize(); exit(3); } } }; template class ProjectedFunctionHermOp : public LinearFunction { public: OperatorFunction & _poly; LinearOperatorBase &_Linop; BlockProjector& _pr; ProjectedFunctionHermOp(BlockProjector& pr,OperatorFunction & poly,LinearOperatorBase& linop) : _poly(poly), _Linop(linop), _pr(pr) { } void operator()(const CoarseField& in, CoarseField& out) { assert(_pr._bgrid._o_blocks == in._grid->oSites()); Field fin(_pr._bgrid._grid); Field fout(_pr._bgrid._grid); GridStopWatch gsw1,gsw2,gsw3; // fill fin gsw1.Start(); _pr.coarseToFine(in,fin); gsw1.Stop(); // apply poly gsw2.Start(); _poly(_Linop,fin,fout); gsw2.Stop(); // fill out gsw3.Start(); _pr.fineToCoarse(fout,out); gsw3.Stop(); auto eps = innerProduct(in,out); std::cout << GridLogMessage << "Operator timing details: c2f = " << gsw1.Elapsed() << " poly = " << gsw2.Elapsed() << " f2c = " << gsw3.Elapsed() << " Complimentary Hermiticity check: " << eps.imag() / std::abs(eps) << std::endl; } }; template class ProjectedHermOp : public LinearFunction { public: LinearOperatorBase &_Linop; BlockProjector& _pr; ProjectedHermOp(BlockProjector& pr,LinearOperatorBase& linop) : _Linop(linop), _pr(pr) { } void operator()(const CoarseField& in, CoarseField& out) { assert(_pr._bgrid._o_blocks == in._grid->oSites()); Field fin(_pr._bgrid._grid); Field fout(_pr._bgrid._grid); _pr.coarseToFine(in,fin); _Linop.HermOp(fin,fout); _pr.fineToCoarse(fout,out); } }; template using CoarseSiteFieldGeneral = iScalar< iVector >; template using CoarseSiteFieldD = CoarseSiteFieldGeneral< vComplexD, N >; template using CoarseSiteFieldF = CoarseSiteFieldGeneral< vComplexF, N >; template using CoarseSiteField = CoarseSiteFieldGeneral< vComplex, N >; template using CoarseLatticeFermion = Lattice< CoarseSiteField >; template using CoarseLatticeFermionD = Lattice< CoarseSiteFieldD >; template void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npoly2, int Nstop2,int Nk2,int Nm2,RealD resid2,RealD betastp2,int MaxIt,int MinRes2, LinearOperatorBase& HermOp, std::vector& eval1, bool cg_test_enabled, int cg_test_maxiter,int nsingle,int SkipTest2, int MaxApply2,bool smoothed_eval_enabled, int smoothed_eval_inner,int smoothed_eval_outer,int smoothed_eval_begin, int smoothed_eval_end,RealD smoothed_eval_inner_resid) { BlockedGrid& bgrid = pr._bgrid; BasisFieldVector& basis = pr._evec; std::vector coarseFourDimLatt; for (int i=0;i<4;i++) coarseFourDimLatt.push_back(bgrid._nb[1+i] * bgrid._grid->_processors[1+i]); assert(bgrid._grid->_processors[0] == 1); std::cout << GridLogMessage << "CoarseGrid = " << coarseFourDimLatt << " with basis = " << Nstop1 << std::endl; GridCartesian * UCoarseGrid = SpaceTimeGrid::makeFourDimGrid(coarseFourDimLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * FCoarseGrid = SpaceTimeGrid::makeFiveDimGrid(bgrid._nb[0],UCoarseGrid); Chebyshev Cheb2(alpha2,beta,Npoly2); CoarseLatticeFermion src_coarse(FCoarseGrid); // Second round of Lanczos in blocked space std::vector eval2(Nm2); std::vector eval3(Nm2); BasisFieldVector > coef(Nm2,FCoarseGrid); ProjectedFunctionHermOp,LatticeFermion> Op2plain(pr,Cheb2,HermOp); CheckpointedLinearFunction > Op2ckpt(src_coarse._grid,Op2plain,"checkpoint",MaxApply2); LinearFunction< CoarseLatticeFermion >* Op2; if (MaxApply2) { Op2 = &Op2ckpt; } else { Op2 = &Op2plain; } ProjectedHermOp,LatticeFermion> Op2nopoly(pr,HermOp); ImplicitlyRestartedLanczos > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,MaxIt,betastp2,MinRes2); src_coarse = 1.0; // Precision test { Field tmp(bgrid._grid); CoarseLatticeFermion tmp2(FCoarseGrid); CoarseLatticeFermion tmp3(FCoarseGrid); tmp2 = 1.0; tmp3 = 1.0; pr.coarseToFine(tmp2,tmp); pr.fineToCoarse(tmp,tmp2); tmp2 -= tmp3; std::cout << GridLogMessage << "Precision Test c->f->c: " << norm2(tmp2) / norm2(tmp3) << std::endl; //bgrid._grid->Barrier(); //return; } int Nconv; if (!FieldVectorIO::read_compressed_vectors("lanczos.output",pr,coef) || !read_evals(UCoarseGrid,(char *)"lanczos.output/eigen-values.txt",eval3) || !read_evals(UCoarseGrid,(char *)"lanczos.output/eigen-values.txt.linear",eval1) || !read_evals(UCoarseGrid,(char *)"lanczos.output/eigen-values.txt.poly",eval2) ) { IRL2.calc(eval2,coef._v,src_coarse,Nconv,true); coef.resize(Nstop2); eval2.resize(Nstop2); eval3.resize(Nstop2); std::vector step3_cache; // reconstruct eigenvalues of original operator for (int i=0;iIsBoss()) { write_evals((char *)"lanczos.output/eigen-values.txt",eval3); write_evals((char *)"lanczos.output/eigen-values.txt.linear",eval1); write_evals((char *)"lanczos.output/eigen-values.txt.poly",eval2); } } // fix up eigenvalues if (!read_evals(UCoarseGrid,(char *)"lanczos.output/eigen-values.txt.smoothed",eval3) && smoothed_eval_enabled) { ConjugateGradient CG(smoothed_eval_inner_resid, smoothed_eval_inner, false); LatticeFermion v_i(basis[0]._grid); auto tmp = v_i; auto tmp2 = v_i; for (int i=smoothed_eval_begin;iIsBoss()) { write_evals((char *)"lanczos.output/eigen-values.txt.smoothed",eval3); write_evals((char *)"lanczos.output/eigen-values.txt",eval3); // also reset this to the best ones we have available } } // do CG test with and without deflation if (cg_test_enabled) { ConjugateGradient CG(1.0e-8, cg_test_maxiter, false); LatticeFermion src_orig(bgrid._grid); src_orig.checkerboard = Odd; src_orig = 1.0; src_orig = src_orig * (1.0 / ::sqrt(norm2(src_orig)) ); auto result = src_orig; // undeflated solve std::cout << GridLogMessage << " Undeflated solve "<IsBoss()) // write_history("cg_test.undefl",CG.ResHistory); // CG.ResHistory.clear(); // deflated solve with all eigenvectors std::cout << GridLogMessage << " Deflated solve with all evectors"<IsBoss()) // write_history("cg_test.defl_all",CG.ResHistory); // CG.ResHistory.clear(); // deflated solve with non-blocked eigenvectors std::cout << GridLogMessage << " Deflated solve with non-blocked evectors"<IsBoss()) // write_history("cg_test.defl_full",CG.ResHistory); // CG.ResHistory.clear(); // deflated solve with all eigenvectors and original eigenvalues from proj std::cout << GridLogMessage << " Deflated solve with all eigenvectors and original eigenvalues from proj"<IsBoss()) // write_history("cg_test.defl_all_ev3",CG.ResHistory); // CG.ResHistory.clear(); } } template void quick_krylov_basis(BasisFieldVector& evec,Field& src,LinearFunction& Op,int Nstop) { Field tmp = src; Field tmp2 = tmp; for (int i=0;i HermOp(Ddwf); // Eigenvector storage const int Nm1 = Np1 + Nk1; const int Nm2 = Np2 + Nk2; // maximum number of vectors we need to keep std::cout << GridLogMessage << "Keep " << Nm1 << " full vectors" << std::endl; std::cout << GridLogMessage << "Keep " << Nm2 << " total vectors" << std::endl; assert(Nm2 >= Nm1); BasisFieldVector evec(Nm1,FrbGrid); // start off with keeping full vectors // First and second cheby Chebyshev Cheb1(alpha1,beta,Npoly1); FunctionHermOp Op1(Cheb1,HermOp); PlainHermOp Op1test(HermOp); // Eigenvalue storage std::vector eval1(evec.size()); // Construct source vector LatticeFermion src(FrbGrid); { src=1.0; src.checkerboard = Odd; // normalize RealD nn = norm2(src); nn = Grid::sqrt(nn); src = src * (1.0/nn); } // Do a benchmark and a quick exit if performance is too little (ugly but needed due to performance fluctuations) if (max_cheb_time_ms) { // one round of warmup auto tmp = src; GridStopWatch gsw1,gsw2; gsw1.Start(); Cheb1(HermOp,src,tmp); gsw1.Stop(); Ddwf.ZeroCounters(); gsw2.Start(); Cheb1(HermOp,src,tmp); gsw2.Stop(); Ddwf.Report(); std::cout << GridLogMessage << "Performance check; warmup = " << gsw1.Elapsed() << " test = " << gsw2.Elapsed() << std::endl; int ms = (int)(gsw2.useconds()/1e3); if (ms > max_cheb_time_ms) { std::cout << GridLogMessage << "Performance too poor: " << ms << " ms, cutoff = " << max_cheb_time_ms << " ms" << std::endl; Grid_finalize(); return 2; } } // First round of Lanczos to get low mode basis ImplicitlyRestartedLanczos IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,MaxIt,betastp1,MinRes1); int Nconv; char tag[1024]; if (!FieldVectorIO::read_argonne(evec,(char *)"checkpoint") || !read_evals(UGrid,(char *)"checkpoint/eigen-values.txt",eval1)) { if (simple_krylov_basis) { quick_krylov_basis(evec,src,Op1,Nstop1); } else { IRL1.calc(eval1,evec._v,src,Nconv,false); } evec.resize(Nstop1); // and throw away superfluous eval1.resize(Nstop1); if (checkpoint_basis) FieldVectorIO::write_argonne(evec,(char *)"checkpoint"); if (UGrid->IsBoss() && checkpoint_basis) write_evals((char *)"checkpoint/eigen-values.txt",eval1); Ddwf.Report(); if (exit_after_basis_calculation) { Grid_finalize(); return 0; } } // now test eigenvectors if (!simple_krylov_basis) { for (int i=0;i