From cb9ff20249d90f528ba1b2609f4cbe3e62b1f437 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 13 Oct 2017 11:30:50 +0100 Subject: [PATCH 01/63] Approx tests and lanczos improvement --- lib/algorithms/approx/Chebyshev.h | 6 +- .../BlockImplicitlyRestartedLanczos.h | 1399 +++++++++-------- .../FieldBasisVector.h | 5 +- .../iterative/ImplicitlyRestartedLanczos.h | 3 +- lib/log/Log.cc | 10 +- lib/log/Log.h | 13 +- lib/threads/Threads.h | 2 + tests/debug/Test_cheby.cc | 36 +- tests/hmc/Test_remez.cc | 61 +- 9 files changed, 823 insertions(+), 712 deletions(-) diff --git a/lib/algorithms/approx/Chebyshev.h b/lib/algorithms/approx/Chebyshev.h index f8c21a05..5088c51b 100644 --- a/lib/algorithms/approx/Chebyshev.h +++ b/lib/algorithms/approx/Chebyshev.h @@ -83,8 +83,10 @@ namespace Grid { public: void csv(std::ostream &out){ - RealD diff = hi-lo; - for (RealD x=lo-0.2*diff; x #define GRID_BIRL_H #include //memset - -#include +//#include #include #include @@ -42,420 +41,185 @@ Author: Christoph Lehner namespace Grid { +template +void basisOrthogonalize(std::vector &basis,Field &w,int k) +{ + for(int j=0; j +void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) +{ + typedef typename Field::vector_object vobj; + GridBase* grid = basis[0]._grid; + + parallel_region + { + std::vector < vobj > B(Nm); // Thread private + + parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ + for(int j=j0; j +void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) +{ + int vlen = idx.size(); + + assert(vlen>=1); + assert(vlen<=sort_vals.size()); + assert(vlen<=_v.size()); + + for (size_t i=0;i i); + ////////////////////////////////////// + // idx[i] is a table of desired sources giving a permutation. + // + // Swap v[i] with v[idx[i]]. + // + // Find j>i for which _vnew[j] = _vold[i], + // track the move idx[j] => idx[i] + // track the move idx[i] => i + ////////////////////////////////////// + size_t j; + for (j=i;j basisSortGetIndex(std::vector& sort_vals) +{ + std::vector idx(sort_vals.size()); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { + return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); + }); + return idx; +} + +template +void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) +{ + std::vector idx = basisSortGetIndex(sort_vals); + if (reverse) + std::reverse(idx.begin(), idx.end()); + + basisReorderInPlace(_v,sort_vals,idx); +} + +// PAB: faster to compute the inner products first then fuse loops. +// If performance critical can improve. 
+template +void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { + result = zero; + assert(_v.size()==eval.size()); + int N = (int)_v.size(); + for (int i=0;i - class BlockImplicitlyRestartedLanczos { - - const RealD small = 1.0e-16; +template +class BlockImplicitlyRestartedLanczos { + private: + const RealD small = 1.0e-8; + int MaxIter; + int MinRestart; // Minimum number of restarts; only check for convergence after + int Nstop; // Number of evecs checked for convergence + int Nk; // Number of converged sought + // int Np; // Np -- Number of spare vecs in krylov space // == Nm - Nk + int Nm; // Nm -- total number of vectors + IRLdiagonalisation diagonalisation; + int orth_period; + + RealD OrthoTime; + RealD eresid, betastp; + //////////////////////////////// + // Embedded objects + //////////////////////////////// + SortEigen _sort; + LinearFunction &_HermOp; + LinearFunction &_HermOpTest; + ///////////////////////// + // Constructor + ///////////////////////// public: - int lock; - int get; - int Niter; - int converged; + BlockImplicitlyRestartedLanczos(LinearFunction & HermOp, + LinearFunction & HermOpTest, + int _Nstop, // sought vecs + int _Nk, // sought vecs + int _Nm, // spare vecs + RealD _eresid, // resid in lmdue deficit + RealD _betastp, // if beta(k) < betastp: converged + int _MaxIter, // Max iterations + int _MinRestart, int _orth_period = 1, + IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : + _HermOp(HermOp), _HermOpTest(HermOpTest), + Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), + eresid(_eresid), betastp(_betastp), + MaxIter(_MaxIter) , MinRestart(_MinRestart), + orth_period(_orth_period), diagonalisation(_diagonalisation) { }; - int Nminres; // Minimum number of restarts; only check for convergence after - int Nstop; // Number of evecs checked for convergence - int Nk; // Number of converged sought - int Np; // Np -- Number of spare vecs in kryloc space - int Nm; // Nm -- total number of vectors + //////////////////////////////// + // Helpers + //////////////////////////////// + template static RealD normalise(T& v) + { + RealD nn = norm2(v); + nn = sqrt(nn); + v = v * (1.0/nn); + return nn; + } - int orth_period; - - RealD OrthoTime; - - RealD eresid, betastp; - SortEigen _sort; - LinearFunction &_HermOp; - LinearFunction &_HermOpTest; - ///////////////////////// - // Constructor - ///////////////////////// - - BlockImplicitlyRestartedLanczos( - LinearFunction & HermOp, - LinearFunction & HermOpTest, - int _Nstop, // sought vecs - int _Nk, // sought vecs - int _Nm, // spare vecs - RealD _eresid, // resid in lmdue deficit - RealD _betastp, // if beta(k) < betastp: converged - int _Niter, // Max iterations - int _Nminres, int _orth_period = 1) : - _HermOp(HermOp), - _HermOpTest(HermOpTest), - Nstop(_Nstop), - Nk(_Nk), - Nm(_Nm), - eresid(_eresid), - betastp(_betastp), - Niter(_Niter), - Nminres(_Nminres), - orth_period(_orth_period) - { - Np = Nm-Nk; assert(Np>0); - }; - - BlockImplicitlyRestartedLanczos( - LinearFunction & HermOp, - LinearFunction & HermOpTest, - int _Nk, // sought vecs - int _Nm, // spare vecs - RealD _eresid, // resid in lmdue deficit - RealD _betastp, // if beta(k) < betastp: converged - int _Niter, // Max iterations - int _Nminres, - int _orth_period = 1) : - _HermOp(HermOp), - _HermOpTest(HermOpTest), - Nstop(_Nk), - Nk(_Nk), - Nm(_Nm), - eresid(_eresid), - betastp(_betastp), - Niter(_Niter), - Nminres(_Nminres), - orth_period(_orth_period) - { - Np = Nm-Nk; assert(Np>0); - }; - - -/* Saad PP. 
195 -1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0 -2. For k = 1,2,...,m Do: -3. wk:=Avk−βkv_{k−1} -4. αk:=(wk,vk) // -5. wk:=wk−αkvk // wk orthog vk -6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop -7. vk+1 := wk/βk+1 -8. EndDo - */ - void step(std::vector& lmd, - std::vector& lme, - BasisFieldVector& evec, - Field& w,int Nm,int k) - { - assert( k< Nm ); - - GridStopWatch gsw_op,gsw_o; - - Field& evec_k = evec[k]; - - gsw_op.Start(); - _HermOp(evec_k,w); - gsw_op.Stop(); - - if(k>0){ - w -= lme[k-1] * evec[k-1]; - } - - ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) - RealD alph = real(zalph); - - w = w - alph * evec_k;// 5. wk:=wk−αkvk - - RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop - // 7. vk+1 := wk/βk+1 - - std::cout<0 && k % orth_period == 0) { - orthogonalize(w,evec,k); // orthonormalise - } - gsw_o.Stop(); - - if(k < Nm-1) { - evec[k+1] = w; - } - - std::cout << GridLogMessage << "Timing: operator=" << gsw_op.Elapsed() << - " orth=" << gsw_o.Elapsed() << std::endl; - - } - - void qr_decomp(std::vector& lmd, - std::vector& lme, - int Nk, - int Nm, - std::vector& Qt, - RealD Dsh, - int kmin, - int kmax) - { - int k = kmin-1; - RealD x; - - RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]); - RealD c = ( lmd[k] -Dsh) *Fden; - RealD s = -lme[k] *Fden; - - RealD tmpa1 = lmd[k]; - RealD tmpa2 = lmd[k+1]; - RealD tmpb = lme[k]; - - lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb; - lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb; - lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb; - x =-s*lme[k+1]; - lme[k+1] = c*lme[k+1]; - - for(int i=0; i& lmd, - std::vector& lme, - int N1, - int N2, - std::vector& Qt, - GridBase *grid){ - - std::cout << GridLogMessage << "diagonalize_lapack start\n"; - GridStopWatch gsw; - - const int size = Nm; - // tevals.resize(size); - // tevecs.resize(size); - LAPACK_INT NN = N1; - std::vector evals_tmp(NN); - std::vector evec_tmp(NN*NN); - memset(&evec_tmp[0],0,sizeof(double)*NN*NN); - // double AA[NN][NN]; - std::vector DD(NN); - std::vector EE(NN); - for (int i = 0; i< NN; i++) - for (int j = i - 1; j <= i + 1; j++) - if ( j < NN && j >= 0 ) { - if (i==j) DD[i] = lmd[i]; - if (i==j) evals_tmp[i] = lmd[i]; - if (j==(i-1)) EE[j] = lme[j]; - } - LAPACK_INT evals_found; - LAPACK_INT lwork = ( (18*NN) > (1+4*NN+NN*NN)? 
(18*NN):(1+4*NN+NN*NN)) ; - LAPACK_INT liwork = 3+NN*10 ; - std::vector iwork(liwork); - std::vector work(lwork); - std::vector isuppz(2*NN); - char jobz = 'V'; // calculate evals & evecs - char range = 'I'; // calculate all evals - // char range = 'A'; // calculate all evals - char uplo = 'U'; // refer to upper half of original matrix - char compz = 'I'; // Compute eigenvectors of tridiagonal matrix - std::vector ifail(NN); - LAPACK_INT info; - // int total = QMP_get_number_of_nodes(); - // int node = QMP_get_node_number(); - // GridBase *grid = evec[0]._grid; - int total = grid->_Nprocessors; - int node = grid->_processor; - int interval = (NN/total)+1; - double vl = 0.0, vu = 0.0; - LAPACK_INT il = interval*node+1 , iu = interval*(node+1); - if (iu > NN) iu=NN; - double tol = 0.0; - if (1) { - memset(&evals_tmp[0],0,sizeof(double)*NN); - if ( il <= NN){ - std::cout << GridLogMessage << "dstegr started" << std::endl; - gsw.Start(); - dstegr(&jobz, &range, &NN, - (double*)&DD[0], (double*)&EE[0], - &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' - &tol, // tolerance - &evals_found, &evals_tmp[0], (double*)&evec_tmp[0], &NN, - &isuppz[0], - &work[0], &lwork, &iwork[0], &liwork, - &info); - gsw.Stop(); - std::cout << GridLogMessage << "dstegr completed in " << gsw.Elapsed() << std::endl; - for (int i = iu-1; i>= il-1; i--){ - evals_tmp[i] = evals_tmp[i - (il-1)]; - if (il>1) evals_tmp[i-(il-1)]=0.; - for (int j = 0; j< NN; j++){ - evec_tmp[i*NN + j] = evec_tmp[(i - (il-1)) * NN + j]; - if (il>1) evec_tmp[(i-(il-1)) * NN + j]=0.; - } - } - } - { - // QMP_sum_double_array(evals_tmp,NN); - // QMP_sum_double_array((double *)evec_tmp,NN*NN); - grid->GlobalSumVector(&evals_tmp[0],NN); - grid->GlobalSumVector(&evec_tmp[0],NN*NN); - } - } - // cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order. 
- for(int i=0;i& lmd, - std::vector& lme, - int N2, - int N1, - std::vector& Qt, - GridBase *grid) - { - -#ifdef USE_LAPACK_IRL - const int check_lapack=0; // just use lapack if 0, check against lapack if 1 - - if(!check_lapack) - return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid); - - std::vector lmd2(N1); - std::vector lme2(N1); - std::vector Qt2(N1*N1); - for(int k=0; k= kmin; --j){ - RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); - if(fabs(lme[j-1])+dds > dds){ - kmax = j+1; - goto continued; - } - } - Niter = iter; -#ifdef USE_LAPACK_IRL - if(check_lapack){ - const double SMALL=1e-8; - diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid); - std::vector lmd3(N2); - for(int k=0; kSMALL) std::cout<SMALL) std::cout<SMALL) std::cout< dds){ - kmin = j+1; - break; - } - } - } - std::cout< - static RealD normalise(T& v) - { - RealD nn = norm2(v); - nn = sqrt(nn); - v = v * (1.0/nn); - return nn; - } - - void orthogonalize(Field& w, - BasisFieldVector& evec, - int k) - { - double t0=-usecond()/1e6; - - evec.orthogonalize(w,k); - - normalise(w); - t0+=usecond()/1e6; - OrthoTime +=t0; - } - - void setUnit_Qt(int Nm, std::vector &Qt) { - for(int i=0; i& evec,int k) + { + OrthoTime-=usecond()/1e6; + //evec.orthogonalize(w,k); + basisOrthogonalize(evec._v,w,k); + normalise(w); + OrthoTime+=usecond()/1e6; + } /* Rudy Arthur's thesis pp.137 ------------------------ @@ -474,280 +238,555 @@ repeat →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM until convergence */ - - void calc(std::vector& eval, - BasisFieldVector& evec, - const Field& src, - int& Nconv, - bool reverse, - int SkipTest) - { - - GridBase *grid = evec._v[0]._grid;//evec.get(0 + evec_offset)._grid; - assert(grid == src._grid); - - std::cout< lme(Nm); - std::vector lme2(Nm); - std::vector eval2(Nm); - std::vector eval2_copy(Nm); - std::vector Qt(Nm*Nm); - - - Field f(grid); - Field v(grid); - - int k1 = 1; - int k2 = Nk; - - Nconv = 0; - - RealD beta_k; - - // Set initial vector - evec[0] = src; - normalise(evec[0]); - std:: cout<0); - evec.rotate(Qt,k1-1,k2+1,0,Nm,Nm); - - t1=usecond()/1e6; - std::cout<= Nminres) { - std::cout << GridLogMessage << "Rotation to test convergence " << std::endl; - - Field ev0_orig(grid); - ev0_orig = evec[0]; - - evec.rotate(Qt,0,Nk,0,Nk,Nm); - - { - std::cout << GridLogMessage << "Test convergence" << std::endl; - Field B(grid); - - for(int j = 0; j=Nstop || beta_k < betastp){ - goto converged; - } - - std::cout << GridLogMessage << "Rotate back" << std::endl; - //B[j] +=Qt[k+_Nm*j] * _v[k]._odata[ss]; - { - Eigen::MatrixXd qm = Eigen::MatrixXd::Zero(Nk,Nk); - for (int k=0;k QtI(Nm*Nm); - for (int k=0;k lme(Nm); + std::vector lme2(Nm); + std::vector eval2(Nm); + std::vector eval2_copy(Nm); + Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); + + Field f(grid); + Field v(grid); + int k1 = 1; + int k2 = Nk; + RealD beta_k; + + Nconv = 0; + + // Set initial vector + evec[0] = src; + normalise(evec[0]); + + // Initial Nk steps + OrthoTime=0.; + for(int k=0; k0); + // evec.rotate(Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis + basisRotate(evec._v,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis + + std::cout<= MinRestart) { + std::cout << GridLogIRL << "Rotation to test convergence " << std::endl; + + Field ev0_orig(grid); + ev0_orig = evec[0]; + + // evec.rotate(Qt,0,Nk,0,Nk,Nm); + basisRotate(evec._v,Qt,0,Nk,0,Nk,Nm); + + { + std::cout << GridLogIRL << "Test convergence" << std::endl; + Field B(grid); + + for(int j = 0; j=Nstop || beta_k < betastp){ + goto converged; + } + + 
std::cout << GridLogIRL << "Convergence testing: Rotating back" << std::endl; + //B[j] +=Qt[k+_Nm*j] * _v[k]._odata[ss]; + { + Eigen::MatrixXd qm = Eigen::MatrixXd::Zero(Nk,Nk); // Restrict Qt to Nk x Nk + for (int k=0;k& lmd, + std::vector& lme, + BasisFieldVector& evec, + Field& w,int Nm,int k) + { + const RealD tiny = 1.0e-20; + assert( k< Nm ); + + GridStopWatch gsw_op,gsw_o; + + Field& evec_k = evec[k]; + + _HermOp(evec_k,w); + std::cout<0) w -= lme[k-1] * evec[k-1]; + + ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) + RealD alph = real(zalph); + + w = w - alph * evec_k;// 5. wk:=wk−αkvk + + RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop + // 7. vk+1 := wk/βk+1 + + lmd[k] = alph; + lme[k] = beta; + + std::cout<0 && k % orth_period == 0) { + orthogonalize(w,evec,k); // orthonormalise + std::cout<& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, // Nm x Nm + GridBase *grid) + { + Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk); + + for(int i=0;i eigensolver(TriDiag); + + for (int i = 0; i < Nk; i++) { + lmd[Nk-1-i] = eigensolver.eigenvalues()(i); + } + for (int i = 0; i < Nk; i++) { + for (int j = 0; j < Nk; j++) { + Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i); + } + } + } + + /////////////////////////////////////////////////////////////////////////// + // File could end here if settle on Eigen ??? + /////////////////////////////////////////////////////////////////////////// + + void QR_decomp(std::vector& lmd, // Nm + std::vector& lme, // Nm + int Nk, int Nm, // Nk, Nm + Eigen::MatrixXd& Qt, // Nm x Nm matrix + RealD Dsh, int kmin, int kmax) + { + int k = kmin-1; + RealD x; + + RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]); + RealD c = ( lmd[k] -Dsh) *Fden; + RealD s = -lme[k] *Fden; + + RealD tmpa1 = lmd[k]; + RealD tmpa2 = lmd[k+1]; + RealD tmpb = lme[k]; + + lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb; + lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb; + lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb; + x =-s*lme[k+1]; + lme[k+1] = c*lme[k+1]; + + for(int i=0; i& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, + GridBase *grid) + { + Qt = Eigen::MatrixXd::Identity(Nm,Nm); + if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) { + diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid); + } else if ( diagonalisation == IRLdiagonaliseWithQR ) { + diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid); + } else if ( diagonalisation == IRLdiagonaliseWithEigen ) { + diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid); + } else { + assert(0); + } + } + +#ifdef USE_LAPACK +void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, + double *vl, double *vu, int *il, int *iu, double *abstol, + int *m, double *w, double *z, int *ldz, int *isuppz, + double *work, int *lwork, int *iwork, int *liwork, + int *info); #endif - }; - +void diagonalize_lapack(std::vector& lmd, + std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd& Qt, + GridBase *grid) +{ +#ifdef USE_LAPACK + const int size = Nm; + int NN = Nk; + double evals_tmp[NN]; + double evec_tmp[NN][NN]; + memset(evec_tmp[0],0,sizeof(double)*NN*NN); + double DD[NN]; + double EE[NN]; + for (int i = 0; i< NN; i++) { + for (int j = i - 1; j <= i + 1; j++) { + if ( j < NN && j >= 0 ) { + if (i==j) DD[i] = lmd[i]; + if (i==j) evals_tmp[i] = lmd[i]; + if (j==(i-1)) EE[j] = lme[j]; + } + } + } + int evals_found; + int lwork = ( (18*NN) > (1+4*NN+NN*NN)? 
(18*NN):(1+4*NN+NN*NN)) ; + int liwork = 3+NN*10 ; + int iwork[liwork]; + double work[lwork]; + int isuppz[2*NN]; + char jobz = 'V'; // calculate evals & evecs + char range = 'I'; // calculate all evals + // char range = 'A'; // calculate all evals + char uplo = 'U'; // refer to upper half of original matrix + char compz = 'I'; // Compute eigenvectors of tridiagonal matrix + int ifail[NN]; + int info; + int total = grid->_Nprocessors; + int node = grid->_processor; + int interval = (NN/total)+1; + double vl = 0.0, vu = 0.0; + int il = interval*node+1 , iu = interval*(node+1); + if (iu > NN) iu=NN; + double tol = 0.0; + if (1) { + memset(evals_tmp,0,sizeof(double)*NN); + if ( il <= NN){ + LAPACK_dstegr(&jobz, &range, &NN, + (double*)DD, (double*)EE, + &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' + &tol, // tolerance + &evals_found, evals_tmp, (double*)evec_tmp, &NN, + isuppz, + work, &lwork, iwork, &liwork, + &info); + for (int i = iu-1; i>= il-1; i--){ + evals_tmp[i] = evals_tmp[i - (il-1)]; + if (il>1) evals_tmp[i-(il-1)]=0.; + for (int j = 0; j< NN; j++){ + evec_tmp[i][j] = evec_tmp[i - (il-1)][j]; + if (il>1) evec_tmp[i-(il-1)][j]=0.; + } + } + } + { + grid->GlobalSumVector(evals_tmp,NN); + grid->GlobalSumVector((double*)evec_tmp,NN*NN); + } + } + // Safer to sort instead of just reversing it, + // but the document of the routine says evals are sorted in increasing order. + // qr gives evals in decreasing order. + for(int i=0;i& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, + GridBase *grid) + { + int QRiter = 100*Nm; + int kmin = 1; + int kmax = Nk; + + // (this should be more sophisticated) + for(int iter=0; iter= kmin; --j){ + RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); + if(fabs(lme[j-1])+dds > dds){ + kmax = j+1; + goto continued; + } + } + QRiter = iter; + return; + + continued: + for(int j=0; j dds){ + kmin = j+1; + break; + } + } + } + std::cout << GridLogError << "[QL method] Error - Too many iteration: "<& Qt,int j0, int j1, int k0,int k1,int Nm) { + void rotate(Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) { GridBase* grid = _v[0]._grid; @@ -62,7 +62,7 @@ class BasisFieldVector { for(int j=j0; j &logstreams) { GridLogError.Active(0); diff --git a/lib/log/Log.h b/lib/log/Log.h index 74d080bb..8db83266 100644 --- a/lib/log/Log.h +++ b/lib/log/Log.h @@ -85,6 +85,7 @@ class Logger { protected: Colours &Painter; int active; + int timing_mode; static int timestamp; std::string name, topName; std::string COLOUR; @@ -101,20 +102,24 @@ public: name(nm), topName(topNm), Painter(col_class), + timing_mode(0), COLOUR(col) {} ; void Active(int on) {active = on;}; int isActive(void) {return active;}; static void Timestamp(int on) {timestamp = on;}; - + void Reset(void) { StopWatch.Reset(); } + void TimingMode(int on) { timing_mode = on; if(on) Reset(); } + friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { - stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : "; - stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : "; + stream << log.background()<< std::left << log.topName << log.background()<< " : "; + stream << log.colour() << std::left << log.name << log.background() << " : "; if ( log.timestamp ) { StopWatch.Stop(); GridTime now = StopWatch.Elapsed(); + if ( log.timing_mode==1 ) StopWatch.Reset(); StopWatch.Start(); stream << log.evidence()<< now << log.background() << " : " ; } @@ -135,6 +140,8 @@ public: void 
GridLogConfigure(std::vector &logstreams); +extern GridLogger GridLogIRL; +extern GridLogger GridLogSolver; extern GridLogger GridLogError; extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; diff --git a/lib/threads/Threads.h b/lib/threads/Threads.h index d15f15ce..36daf2af 100644 --- a/lib/threads/Threads.h +++ b/lib/threads/Threads.h @@ -51,7 +51,9 @@ Author: paboyle #define PARALLEL_CRITICAL #endif +#define parallel_region PARALLEL_REGION #define parallel_for PARALLEL_FOR_LOOP for +#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for namespace Grid { diff --git a/tests/debug/Test_cheby.cc b/tests/debug/Test_cheby.cc index 40544c56..72d07885 100644 --- a/tests/debug/Test_cheby.cc +++ b/tests/debug/Test_cheby.cc @@ -37,8 +37,15 @@ RealD InverseApproximation(RealD x){ RealD SqrtApproximation(RealD x){ return std::sqrt(x); } +RealD Approximation32(RealD x){ + return std::pow(x,-1.0/32.0); +} +RealD Approximation2(RealD x){ + return std::pow(x,-1.0/2.0); +} + RealD StepFunction(RealD x){ - if ( x<0.1 ) return 1.0; + if ( x<10.0 ) return 1.0; else return 0.0; } @@ -56,7 +63,6 @@ int main (int argc, char ** argv) Chebyshev ChebyInv(lo,hi,2000,InverseApproximation); - { std::ofstream of("chebyinv"); ChebyInv.csv(of); @@ -78,7 +84,6 @@ int main (int argc, char ** argv) ChebyStep.JacksonSmooth(); - { std::ofstream of("chebystepjack"); ChebyStep.csv(of); @@ -100,5 +105,30 @@ int main (int argc, char ** argv) ChebyNE.csv(of); } + lo=0.0; + hi=4.0; + Chebyshev Cheby32(lo,hi,2000,Approximation32); + { + std::ofstream of("cheby32"); + Cheby32.csv(of); + } + Cheby32.JacksonSmooth(); + { + std::ofstream of("cheby32jack"); + Cheby32.csv(of); + } + + Chebyshev ChebySqrt(lo,hi,2000,Approximation2); + { + std::ofstream of("chebysqrt"); + ChebySqrt.csv(of); + } + ChebySqrt.JacksonSmooth(); + { + std::ofstream of("chebysqrtjack"); + ChebySqrt.csv(of); + } + + Grid_finalize(); } diff --git a/tests/hmc/Test_remez.cc b/tests/hmc/Test_remez.cc index bc851173..5f4b0a25 100644 --- a/tests/hmc/Test_remez.cc +++ b/tests/hmc/Test_remez.cc @@ -38,11 +38,11 @@ int main (int argc, char ** argv) std::cout< Date: Fri, 13 Oct 2017 13:22:26 +0100 Subject: [PATCH 02/63] Final version prior to reunification --- .../BlockImplicitlyRestartedLanczos.h | 45 +++++++++---------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h index 90d45193..de3f1790 100644 --- a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h @@ -35,9 +35,6 @@ Author: Christoph Lehner //#include #include -#include -#include -#include namespace Grid { @@ -178,7 +175,7 @@ class BlockImplicitlyRestartedLanczos { //////////////////////////////// // Embedded objects //////////////////////////////// - SortEigen _sort; + // SortEigen _sort; LinearFunction &_HermOp; LinearFunction &_HermOpTest; ///////////////////////// @@ -212,11 +209,10 @@ public: return nn; } - void orthogonalize(Field& w, BasisFieldVector& evec,int k) + void orthogonalize(Field& w, std::vector& evec,int k) { OrthoTime-=usecond()/1e6; - //evec.orthogonalize(w,k); - basisOrthogonalize(evec._v,w,k); + basisOrthogonalize(evec,w,k); normalise(w); OrthoTime+=usecond()/1e6; } @@ -238,7 +234,7 @@ repeat 
→AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM until convergence */ - void calc(std::vector& eval, BasisFieldVector& evec, const Field& src, int& Nconv, bool reverse, int SkipTest) + void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse, int SkipTest) { GridBase *grid = src._grid; assert(grid == evec[0]._grid); @@ -341,7 +337,8 @@ until convergence ////////////////////////////////// eval2_copy = eval2; - _sort.push(eval2,Nm); + // _sort.push(eval2,Nm); + std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end()); std::cout<0); - // evec.rotate(Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis - basisRotate(evec._v,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis + basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis std::cout< //memset +//#include +#include -namespace Grid { +namespace Grid { - enum IRLdiagonalisation { - IRLdiagonaliseWithDSTEGR, - IRLdiagonaliseWithQR, - IRLdiagonaliseWithEigen - }; - -//////////////////////////////////////////////////////////////////////////////// -// Helper class for sorting the evalues AND evectors by Field -// Use pointer swizzle on vectors -//////////////////////////////////////////////////////////////////////////////// template -class SortEigen { - private: - static bool less_lmd(RealD left,RealD right){ - return left > right; - } - static bool less_pair(std::pair& left, - std::pair& right){ - return left.first > (right.first); - } - - public: - void push(std::vector& lmd,std::vector& evec,int N) { - - //////////////////////////////////////////////////////////////////////// - // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set. - // : The vector reorder should be done by pointer swizzle somehow - //////////////////////////////////////////////////////////////////////// - std::vector cpy(lmd.size(),evec[0]._grid); - for(int i=0;i > emod(lmd.size()); +void basisOrthogonalize(std::vector &basis,Field &w,int k) +{ + for(int j=0; j(lmd[i],&cpy[i]); - - partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair); - - typename std::vector >::iterator it = emod.begin(); - for(int i=0;ifirst; - evec[i]=*(it->second); - ++it; +template +void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) +{ + typedef typename Field::vector_object vobj; + GridBase* grid = basis[0]._grid; + + parallel_region + { + std::vector < vobj > B(Nm); // Thread private + + parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ + for(int j=j0; j& lmd,int N) { - std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd); +} + +template +void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) +{ + int vlen = idx.size(); + + assert(vlen>=1); + assert(vlen<=sort_vals.size()); + assert(vlen<=_v.size()); + + for (size_t i=0;i i); + ////////////////////////////////////// + // idx[i] is a table of desired sources giving a permutation. + // + // Swap v[i] with v[idx[i]]. 
+ // + // Find j>i for which _vnew[j] = _vold[i], + // track the move idx[j] => idx[i] + // track the move idx[i] => i + ////////////////////////////////////// + size_t j; + for (j=i;j fabs(thrs); +} + +inline std::vector basisSortGetIndex(std::vector& sort_vals) +{ + std::vector idx(sort_vals.size()); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { + return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); + }); + return idx; +} + +template +void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) +{ + std::vector idx = basisSortGetIndex(sort_vals); + if (reverse) + std::reverse(idx.begin(), idx.end()); + + basisReorderInPlace(_v,sort_vals,idx); +} + +// PAB: faster to compute the inner products first then fuse loops. +// If performance critical can improve. +template +void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { + result = zero; + assert(_v.size()==eval.size()); + int N = (int)_v.size(); + for (int i=0;i class ImplicitlyRestartedLanczos { - -private: - - int MaxIter; // Max iterations - int Nstop; // Number of evecs checked for convergence - int Nk; // Number of converged sought - int Nm; // Nm -- total number of vectors - RealD eresid; + private: + const RealD small = 1.0e-8; + int MaxIter; + int MinRestart; // Minimum number of restarts; only check for convergence after + int Nstop; // Number of evecs checked for convergence + int Nk; // Number of converged sought + // int Np; // Np -- Number of spare vecs in krylov space // == Nm - Nk + int Nm; // Nm -- total number of vectors IRLdiagonalisation diagonalisation; - //////////////////////////////////// + int orth_period; + + RealD OrthoTime; + RealD eresid, betastp; + //////////////////////////////// // Embedded objects - //////////////////////////////////// - SortEigen _sort; - LinearOperatorBase &_Linop; - OperatorFunction &_poly; - + //////////////////////////////// + LinearFunction &_HermOp; + LinearFunction &_HermOpTest; ///////////////////////// // Constructor ///////////////////////// public: - ImplicitlyRestartedLanczos(LinearOperatorBase &Linop, // op - OperatorFunction & poly, // polynomial - int _Nstop, // really sought vecs - int _Nk, // sought vecs - int _Nm, // total vecs - RealD _eresid, // resid in lmd deficit - int _MaxIter, // Max iterations - IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen ) : - _Linop(Linop), _poly(poly), - Nstop(_Nstop), Nk(_Nk), Nm(_Nm), - eresid(_eresid), MaxIter(_MaxIter), - diagonalisation(_diagonalisation) - { }; + ImplicitlyRestartedLanczos(LinearFunction & HermOp, + LinearFunction & HermOpTest, + int _Nstop, // sought vecs + int _Nk, // sought vecs + int _Nm, // spare vecs + RealD _eresid, // resid in lmdue deficit + RealD _betastp, // if beta(k) < betastp: converged + int _MaxIter, // Max iterations + int _MinRestart, int _orth_period = 1, + IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : + _HermOp(HermOp), _HermOpTest(HermOpTest), + Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), + eresid(_eresid), betastp(_betastp), + MaxIter(_MaxIter) , MinRestart(_MinRestart), + orth_period(_orth_period), diagonalisation(_diagonalisation) { }; //////////////////////////////// // Helpers //////////////////////////////// - static RealD normalise(Field& v) + template static RealD normalise(T& v) { RealD nn = norm2(v); nn = sqrt(nn); v = v * (1.0/nn); return nn; } - - void orthogonalize(Field& w, std::vector& 
evec, int k) + + void orthogonalize(Field& w, std::vector& evec,int k) { - typedef typename Field::scalar_type MyComplex; - MyComplex ip; - - for(int j=0; j& eval, std::vector& evec, const Field& src, int& Nconv) + void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse, int SkipTest) { + GridBase *grid = src._grid; + assert(grid == evec[0]._grid); - GridBase *grid = evec[0]._grid; - assert(grid == src._grid); - - std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; - std::cout << GridLogMessage <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl; - std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; - std::cout << GridLogMessage <<" -- seek Nk = " << Nk <<" vectors"<< std::endl; - std::cout << GridLogMessage <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl; - std::cout << GridLogMessage <<" -- total Nm = " << Nm <<" vectors"<< std::endl; - std::cout << GridLogMessage <<" -- size of eval = " << eval.size() << std::endl; - std::cout << GridLogMessage <<" -- size of evec = " << evec.size() << std::endl; + GridLogIRL.TimingMode(1); + std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; + std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl; + std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; + std::cout << GridLogIRL <<" -- seek Nk = " << Nk <<" vectors"<< std::endl; + std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl; + std::cout << GridLogIRL <<" -- total Nm = " << Nm <<" vectors"<< std::endl; + std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl; + std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl; if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) { - std::cout << GridLogMessage << "Diagonalisation is DSTEGR "< lme(Nm); std::vector lme2(Nm); std::vector eval2(Nm); + std::vector eval2_copy(Nm); + Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); - Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); - - std::vector Iconv(Nm); - std::vector B(Nm,grid); // waste of space replicating - Field f(grid); Field v(grid); - int k1 = 1; int k2 = Nk; - - Nconv = 0; - RealD beta_k; + + Nconv = 0; // Set initial vector evec[0] = src; - std::cout << GridLogMessage <<"norm2(src)= " << norm2(src)<0); + basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis + + std::cout<= MinRestart) { + std::cout << GridLogIRL << "Rotation to test convergence " << std::endl; - _Linop.HermOp(B[i],v); + Field ev0_orig(grid); + ev0_orig = evec[0]; - RealD vnum = real(innerProduct(B[i],v)); // HermOp. 
- RealD vden = norm2(B[i]); - eval2[i] = vnum/vden; - v -= eval2[i]*B[i]; - RealD vv = norm2(v); - - std::cout.precision(13); - std::cout << GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <=Nstop ){ - goto converged; - } - } // end of iter loop - - std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; - std::cout << GridLogError <<" ImplicitlyRestartedLanczos::calc() NOT converged."; - std::cout << GridLogMessage <<"**************************************************************************"<< std::endl; + { + std::cout << GridLogIRL << "Test convergence" << std::endl; + Field B(grid); + + for(int j = 0; j=Nstop || beta_k < betastp){ + goto converged; + } + + //B[j] +=Qt[k+_Nm*j] * _v[k]._odata[ss]; + { + Eigen::MatrixXd qm = Eigen::MatrixXd::Zero(Nk,Nk); // Restrict Qt to Nk x Nk + for (int k=0;k0) w -= lme[k-1] * evec[k-1]; - - ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk) + + ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) RealD alph = real(zalph); - - w = w - alph * evec[k];// 5. wk:=wk−αkvk - + + w = w - alph * evec_k;// 5. wk:=wk−αkvk + RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop // 7. vk+1 := wk/βk+1 - + lmd[k] = alph; lme[k] = beta; - - if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise - if ( k < Nm-1) evec[k+1] = w; - - if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<0 && k % orth_period == 0) { + orthogonalize(w,evec,k); // orthonormalise + std::cout<& lmd, std::vector& lme, int Nk, int Nm, Eigen::MatrixXd & Qt, // Nm x Nm @@ -405,11 +565,12 @@ private: } } } + /////////////////////////////////////////////////////////////////////////// // File could end here if settle on Eigen ??? 
/////////////////////////////////////////////////////////////////////////// - void qr_decomp(std::vector& lmd, // Nm + void QR_decomp(std::vector& lmd, // Nm std::vector& lme, // Nm int Nk, int Nm, // Nk, Nm Eigen::MatrixXd& Qt, // Nm x Nm matrix @@ -576,51 +737,50 @@ void diagonalize_lapack(std::vector& lmd, #endif } - void diagonalize_QR(std::vector& lmd, std::vector& lme, - int Nk, int Nm, - Eigen::MatrixXd & Qt, - GridBase *grid) - { - int Niter = 100*Nm; - int kmin = 1; - int kmax = Nk; - - // (this should be more sophisticated) - for(int iter=0; iter= kmin; --j){ - RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); - if(fabs(lme[j-1])+dds > dds){ - kmax = j+1; - goto continued; - } - } - Niter = iter; - return; - - continued: - for(int j=0; j dds){ - kmin = j+1; - break; - } +void diagonalize_QR(std::vector& lmd, std::vector& lme, + int Nk, int Nm, + Eigen::MatrixXd & Qt, + GridBase *grid) +{ + int QRiter = 100*Nm; + int kmin = 1; + int kmax = Nk; + + // (this should be more sophisticated) + for(int iter=0; iter= kmin; --j){ + RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); + if(fabs(lme[j-1])+dds > dds){ + kmax = j+1; + goto continued; + } + } + QRiter = iter; + return; + + continued: + for(int j=0; j dds){ + kmin = j+1; + break; } } - std::cout << GridLogError << "[QL method] Error - Too many iteration: "< Date: Fri, 13 Oct 2017 13:23:07 +0100 Subject: [PATCH 04/63] Logging improvement; reunified the Lanczos codes --- .../BlockImplicitlyRestartedLanczos.h | 789 ------------------ lib/log/Log.cc | 2 +- lib/log/Log.h | 30 +- lib/util/Init.cc | 2 +- tests/lanczos/Test_dwf_compressed_lanczos.cc | 17 +- 5 files changed, 36 insertions(+), 804 deletions(-) delete mode 100644 lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h diff --git a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h deleted file mode 100644 index de3f1790..00000000 --- a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h +++ /dev/null @@ -1,789 +0,0 @@ - /************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h - - Copyright (C) 2015 - -Author: Peter Boyle -Author: paboyle -Author: Chulwoo Jung -Author: Christoph Lehner - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -#ifndef GRID_BIRL_H -#define GRID_BIRL_H - -#include //memset -//#include -#include - - -namespace Grid { - -template -void basisOrthogonalize(std::vector &basis,Field &w,int k) -{ - for(int j=0; j -void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) -{ - typedef typename Field::vector_object vobj; - GridBase* grid = basis[0]._grid; - - parallel_region - { - std::vector < vobj > B(Nm); // Thread private - - parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ - for(int j=j0; j -void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) -{ - int vlen = idx.size(); - - assert(vlen>=1); - assert(vlen<=sort_vals.size()); - assert(vlen<=_v.size()); - - for (size_t i=0;i i); - ////////////////////////////////////// - // idx[i] is a table of desired sources giving a permutation. - // - // Swap v[i] with v[idx[i]]. - // - // Find j>i for which _vnew[j] = _vold[i], - // track the move idx[j] => idx[i] - // track the move idx[i] => i - ////////////////////////////////////// - size_t j; - for (j=i;j basisSortGetIndex(std::vector& sort_vals) -{ - std::vector idx(sort_vals.size()); - std::iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { - return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); - }); - return idx; -} - -template -void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) -{ - std::vector idx = basisSortGetIndex(sort_vals); - if (reverse) - std::reverse(idx.begin(), idx.end()); - - basisReorderInPlace(_v,sort_vals,idx); -} - -// PAB: faster to compute the inner products first then fuse loops. -// If performance critical can improve. 
-template -void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { - result = zero; - assert(_v.size()==eval.size()); - int N = (int)_v.size(); - for (int i=0;i -class BlockImplicitlyRestartedLanczos { - private: - const RealD small = 1.0e-8; - int MaxIter; - int MinRestart; // Minimum number of restarts; only check for convergence after - int Nstop; // Number of evecs checked for convergence - int Nk; // Number of converged sought - // int Np; // Np -- Number of spare vecs in krylov space // == Nm - Nk - int Nm; // Nm -- total number of vectors - IRLdiagonalisation diagonalisation; - int orth_period; - - RealD OrthoTime; - RealD eresid, betastp; - //////////////////////////////// - // Embedded objects - //////////////////////////////// - // SortEigen _sort; - LinearFunction &_HermOp; - LinearFunction &_HermOpTest; - ///////////////////////// - // Constructor - ///////////////////////// -public: - BlockImplicitlyRestartedLanczos(LinearFunction & HermOp, - LinearFunction & HermOpTest, - int _Nstop, // sought vecs - int _Nk, // sought vecs - int _Nm, // spare vecs - RealD _eresid, // resid in lmdue deficit - RealD _betastp, // if beta(k) < betastp: converged - int _MaxIter, // Max iterations - int _MinRestart, int _orth_period = 1, - IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : - _HermOp(HermOp), _HermOpTest(HermOpTest), - Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), - eresid(_eresid), betastp(_betastp), - MaxIter(_MaxIter) , MinRestart(_MinRestart), - orth_period(_orth_period), diagonalisation(_diagonalisation) { }; - - //////////////////////////////// - // Helpers - //////////////////////////////// - template static RealD normalise(T& v) - { - RealD nn = norm2(v); - nn = sqrt(nn); - v = v * (1.0/nn); - return nn; - } - - void orthogonalize(Field& w, std::vector& evec,int k) - { - OrthoTime-=usecond()/1e6; - basisOrthogonalize(evec,w,k); - normalise(w); - OrthoTime+=usecond()/1e6; - } - -/* Rudy Arthur's thesis pp.137 ------------------------- -Require: M > K P = M − K † -Compute the factorization AVM = VM HM + fM eM -repeat - Q=I - for i = 1,...,P do - QiRi =HM −θiI Q = QQi - H M = Q †i H M Q i - end for - βK =HM(K+1,K) σK =Q(M,K) - r=vK+1βK +rσK - VK =VM(1:M)Q(1:M,1:K) - HK =HM(1:K,1:K) - →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM -until convergence -*/ - void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse, int SkipTest) - { - GridBase *grid = src._grid; - assert(grid == evec[0]._grid); - - GridLogIRL.TimingMode(1); - std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; - std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl; - std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; - std::cout << GridLogIRL <<" -- seek Nk = " << Nk <<" vectors"<< std::endl; - std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl; - std::cout << GridLogIRL <<" -- total Nm = " << Nm <<" vectors"<< std::endl; - std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl; - std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl; - if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) { - std::cout << GridLogIRL << "Diagonalisation is DSTEGR "< lme(Nm); - std::vector lme2(Nm); - std::vector eval2(Nm); - std::vector eval2_copy(Nm); - 
Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm); - - Field f(grid); - Field v(grid); - int k1 = 1; - int k2 = Nk; - RealD beta_k; - - Nconv = 0; - - // Set initial vector - evec[0] = src; - normalise(evec[0]); - - // Initial Nk steps - OrthoTime=0.; - for(int k=0; k0); - basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis - - std::cout<= MinRestart) { - std::cout << GridLogIRL << "Rotation to test convergence " << std::endl; - - Field ev0_orig(grid); - ev0_orig = evec[0]; - - basisRotate(evec,Qt,0,Nk,0,Nk,Nm); - - { - std::cout << GridLogIRL << "Test convergence" << std::endl; - Field B(grid); - - for(int j = 0; j=Nstop || beta_k < betastp){ - goto converged; - } - - //B[j] +=Qt[k+_Nm*j] * _v[k]._odata[ss]; - { - Eigen::MatrixXd qm = Eigen::MatrixXd::Zero(Nk,Nk); // Restrict Qt to Nk x Nk - for (int k=0;k& lmd, - std::vector& lme, - std::vector& evec, - Field& w,int Nm,int k) - { - const RealD tiny = 1.0e-20; - assert( k< Nm ); - - GridStopWatch gsw_op,gsw_o; - - Field& evec_k = evec[k]; - - _HermOp(evec_k,w); - std::cout<0) w -= lme[k-1] * evec[k-1]; - - ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) - RealD alph = real(zalph); - - w = w - alph * evec_k;// 5. wk:=wk−αkvk - - RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop - // 7. vk+1 := wk/βk+1 - - lmd[k] = alph; - lme[k] = beta; - - std::cout<0 && k % orth_period == 0) { - orthogonalize(w,evec,k); // orthonormalise - std::cout<& lmd, std::vector& lme, - int Nk, int Nm, - Eigen::MatrixXd & Qt, // Nm x Nm - GridBase *grid) - { - Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk); - - for(int i=0;i eigensolver(TriDiag); - - for (int i = 0; i < Nk; i++) { - lmd[Nk-1-i] = eigensolver.eigenvalues()(i); - } - for (int i = 0; i < Nk; i++) { - for (int j = 0; j < Nk; j++) { - Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i); - } - } - } - - /////////////////////////////////////////////////////////////////////////// - // File could end here if settle on Eigen ??? 
- /////////////////////////////////////////////////////////////////////////// - - void QR_decomp(std::vector& lmd, // Nm - std::vector& lme, // Nm - int Nk, int Nm, // Nk, Nm - Eigen::MatrixXd& Qt, // Nm x Nm matrix - RealD Dsh, int kmin, int kmax) - { - int k = kmin-1; - RealD x; - - RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]); - RealD c = ( lmd[k] -Dsh) *Fden; - RealD s = -lme[k] *Fden; - - RealD tmpa1 = lmd[k]; - RealD tmpa2 = lmd[k+1]; - RealD tmpb = lme[k]; - - lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb; - lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb; - lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb; - x =-s*lme[k+1]; - lme[k+1] = c*lme[k+1]; - - for(int i=0; i& lmd, std::vector& lme, - int Nk, int Nm, - Eigen::MatrixXd & Qt, - GridBase *grid) - { - Qt = Eigen::MatrixXd::Identity(Nm,Nm); - if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) { - diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid); - } else if ( diagonalisation == IRLdiagonaliseWithQR ) { - diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid); - } else if ( diagonalisation == IRLdiagonaliseWithEigen ) { - diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid); - } else { - assert(0); - } - } - -#ifdef USE_LAPACK -void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, - double *vl, double *vu, int *il, int *iu, double *abstol, - int *m, double *w, double *z, int *ldz, int *isuppz, - double *work, int *lwork, int *iwork, int *liwork, - int *info); -#endif - -void diagonalize_lapack(std::vector& lmd, - std::vector& lme, - int Nk, int Nm, - Eigen::MatrixXd& Qt, - GridBase *grid) -{ -#ifdef USE_LAPACK - const int size = Nm; - int NN = Nk; - double evals_tmp[NN]; - double evec_tmp[NN][NN]; - memset(evec_tmp[0],0,sizeof(double)*NN*NN); - double DD[NN]; - double EE[NN]; - for (int i = 0; i< NN; i++) { - for (int j = i - 1; j <= i + 1; j++) { - if ( j < NN && j >= 0 ) { - if (i==j) DD[i] = lmd[i]; - if (i==j) evals_tmp[i] = lmd[i]; - if (j==(i-1)) EE[j] = lme[j]; - } - } - } - int evals_found; - int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; - int liwork = 3+NN*10 ; - int iwork[liwork]; - double work[lwork]; - int isuppz[2*NN]; - char jobz = 'V'; // calculate evals & evecs - char range = 'I'; // calculate all evals - // char range = 'A'; // calculate all evals - char uplo = 'U'; // refer to upper half of original matrix - char compz = 'I'; // Compute eigenvectors of tridiagonal matrix - int ifail[NN]; - int info; - int total = grid->_Nprocessors; - int node = grid->_processor; - int interval = (NN/total)+1; - double vl = 0.0, vu = 0.0; - int il = interval*node+1 , iu = interval*(node+1); - if (iu > NN) iu=NN; - double tol = 0.0; - if (1) { - memset(evals_tmp,0,sizeof(double)*NN); - if ( il <= NN){ - LAPACK_dstegr(&jobz, &range, &NN, - (double*)DD, (double*)EE, - &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A' - &tol, // tolerance - &evals_found, evals_tmp, (double*)evec_tmp, &NN, - isuppz, - work, &lwork, iwork, &liwork, - &info); - for (int i = iu-1; i>= il-1; i--){ - evals_tmp[i] = evals_tmp[i - (il-1)]; - if (il>1) evals_tmp[i-(il-1)]=0.; - for (int j = 0; j< NN; j++){ - evec_tmp[i][j] = evec_tmp[i - (il-1)][j]; - if (il>1) evec_tmp[i-(il-1)][j]=0.; - } - } - } - { - grid->GlobalSumVector(evals_tmp,NN); - grid->GlobalSumVector((double*)evec_tmp,NN*NN); - } - } - // Safer to sort instead of just reversing it, - // but the document of the routine says evals are sorted in increasing order. - // qr gives evals in decreasing order. 
- for(int i=0;i& lmd, std::vector& lme, - int Nk, int Nm, - Eigen::MatrixXd & Qt, - GridBase *grid) - { - int QRiter = 100*Nm; - int kmin = 1; - int kmax = Nk; - - // (this should be more sophisticated) - for(int iter=0; iter= kmin; --j){ - RealD dds = fabs(lmd[j-1])+fabs(lmd[j]); - if(fabs(lme[j-1])+dds > dds){ - kmax = j+1; - goto continued; - } - } - QRiter = iter; - return; - - continued: - for(int j=0; j dds){ - kmin = j+1; - break; - } - } - } - std::cout << GridLogError << "[QL method] Error - Too many iteration: "<Reset(); + StopWatch->Start(); + } + void TimingMode(int on) { + timing_mode = on; + if(on) { + StopWatch = &LocalStopWatch; + Reset(); + } + } friend std::ostream& operator<< (std::ostream& stream, Logger& log){ @@ -117,10 +131,10 @@ public: stream << log.background()<< std::left << log.topName << log.background()<< " : "; stream << log.colour() << std::left << log.name << log.background() << " : "; if ( log.timestamp ) { - StopWatch.Stop(); - GridTime now = StopWatch.Elapsed(); - if ( log.timing_mode==1 ) StopWatch.Reset(); - StopWatch.Start(); + log.StopWatch->Stop(); + GridTime now = log.StopWatch->Elapsed(); + if ( log.timing_mode==1 ) log.StopWatch->Reset(); + log.StopWatch->Start(); stream << log.evidence()<< now << log.background() << " : " ; } stream << log.colour(); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 1266d34d..031f8f5a 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -208,7 +208,7 @@ static int Grid_is_initialised = 0; void Grid_init(int *argc,char ***argv) { - GridLogger::StopWatch.Start(); + GridLogger::GlobalStopWatch.Start(); std::string arg; diff --git a/tests/lanczos/Test_dwf_compressed_lanczos.cc b/tests/lanczos/Test_dwf_compressed_lanczos.cc index 7fe37387..544d0358 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos.cc @@ -21,7 +21,14 @@ (ortho krylov low poly); and then fix up lowest say 200 eigenvalues by 1 run with high-degree poly (600 could be enough) */ #include -#include +#include +///////////////////////////////////////////////////////////////////////////// +// The following are now decoupled from the Lanczos and deal with grids. 
+// Safe to replace functionality +///////////////////////////////////////////////////////////////////////////// +#include +#include +#include #include "FieldVectorIO.h" #include "Params.h" @@ -319,7 +326,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo Op2 = &Op2plain; } ProjectedHermOp,LatticeFermion> Op2nopoly(pr,HermOp); - BlockImplicitlyRestartedLanczos > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,betastp2,MaxIt,MinRes2); + ImplicitlyRestartedLanczos > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,betastp2,MaxIt,MinRes2); src_coarse = 1.0; @@ -350,7 +357,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo ) { - IRL2.calc(eval2,coef,src_coarse,Nconv,true,SkipTest2); + IRL2.calc(eval2,coef._v,src_coarse,Nconv,true,SkipTest2); coef.resize(Nstop2); eval2.resize(Nstop2); @@ -641,7 +648,7 @@ int main (int argc, char ** argv) { } // First round of Lanczos to get low mode basis - BlockImplicitlyRestartedLanczos IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,betastp1,MaxIt,MinRes1); + ImplicitlyRestartedLanczos IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,betastp1,MaxIt,MinRes1); int Nconv; char tag[1024]; @@ -650,7 +657,7 @@ int main (int argc, char ** argv) { if (simple_krylov_basis) { quick_krylov_basis(evec,src,Op1,Nstop1); } else { - IRL1.calc(eval1,evec,src,Nconv,false,1); + IRL1.calc(eval1,evec._v,src,Nconv,false,1); } evec.resize(Nstop1); // and throw away superfluous eval1.resize(Nstop1); From e325929851aa0e26055875a22b39aee39ed186cd Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 13 Oct 2017 14:02:43 +0100 Subject: [PATCH 05/63] ALl codes compile against the new Lanczos call signature --- lib/algorithms/LinearOperator.h | 59 +++++++++++++++++++ lib/algorithms/approx/Chebyshev.h | 35 ----------- .../iterative/ImplicitlyRestartedLanczos.h | 6 +- tests/lanczos/Test_dwf_compressed_lanczos.cc | 30 +--------- tests/lanczos/Test_dwf_lanczos.cc | 11 ++-- tests/lanczos/Test_synthetic_lanczos.cc | 10 ++-- tests/lanczos/Test_wilson_lanczos.cc | 9 ++- 7 files changed, 82 insertions(+), 78 deletions(-) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index f1b8820e..0d32cc15 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -346,6 +346,7 @@ namespace Grid { virtual void operator() (const Field &in, Field &out) = 0; }; + ///////////////////////////////////////////////////////////// // Base classes for Multishift solvers for operators ///////////////////////////////////////////////////////////// @@ -368,6 +369,64 @@ namespace Grid { }; */ + //////////////////////////////////////////////////////////////////////////////////////////// + // Hermitian operator Linear function and operator function + //////////////////////////////////////////////////////////////////////////////////////////// + template + class HermOpOperatorFunction : public OperatorFunction { + void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) { + Linop.HermOp(in,out); + }; + }; + + template + class PlainHermOp : public LinearFunction { + public: + LinearOperatorBase &_Linop; + + PlainHermOp(LinearOperatorBase& linop) : _Linop(linop) + {} + + void operator()(const Field& in, Field& out) { + _Linop.HermOp(in,out); + } + }; + + template + class FunctionHermOp : public LinearFunction { + public: + OperatorFunction & _poly; + LinearOperatorBase &_Linop; + + FunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop) + : _poly(poly), _Linop(linop) {}; + + void operator()(const Field& in, Field& out) { + 
_poly(_Linop,in,out); + } + }; + + template + class Polynomial : public OperatorFunction { + private: + std::vector Coeffs; + public: + Polynomial(std::vector &_Coeffs) : Coeffs(_Coeffs) { }; + + // Implement the required interface + void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) { + + Field AtoN(in._grid); + Field Mtmp(in._grid); + AtoN = in; + out = AtoN*Coeffs[0]; + for(int n=1;n namespace Grid { - //////////////////////////////////////////////////////////////////////////////////////////// - // Simple general polynomial with user supplied coefficients - //////////////////////////////////////////////////////////////////////////////////////////// - template - class HermOpOperatorFunction : public OperatorFunction { - void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) { - Linop.HermOp(in,out); - }; - }; - - template - class Polynomial : public OperatorFunction { - private: - std::vector Coeffs; - public: - Polynomial(std::vector &_Coeffs) : Coeffs(_Coeffs) { }; - - // Implement the required interface - void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) { - - Field AtoN(in._grid); - Field Mtmp(in._grid); - AtoN = in; - out = AtoN*Coeffs[0]; -// std::cout <<"Poly in " <& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse, int SkipTest) + void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse=true, int SkipTest=0) { GridBase *grid = src._grid; assert(grid == evec[0]._grid); diff --git a/tests/lanczos/Test_dwf_compressed_lanczos.cc b/tests/lanczos/Test_dwf_compressed_lanczos.cc index 544d0358..10d6c3ae 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos.cc @@ -100,19 +100,6 @@ void write_history(char* fn, std::vector& hist) { fclose(f); } -template -class FunctionHermOp : public LinearFunction { -public: - OperatorFunction & _poly; - LinearOperatorBase &_Linop; - - FunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop) : _poly(poly), _Linop(linop) { - } - - void operator()(const Field& in, Field& out) { - _poly(_Linop,in,out); - } -}; template class CheckpointedLinearFunction : public LinearFunction { @@ -268,19 +255,6 @@ public: } }; -template -class PlainHermOp : public LinearFunction { -public: - LinearOperatorBase &_Linop; - - PlainHermOp(LinearOperatorBase& linop) : _Linop(linop) { - } - - void operator()(const Field& in, Field& out) { - _Linop.HermOp(in,out); - } -}; - template using CoarseSiteFieldGeneral = iScalar< iVector >; template using CoarseSiteFieldD = CoarseSiteFieldGeneral< vComplexD, N >; template using CoarseSiteFieldF = CoarseSiteFieldGeneral< vComplexF, N >; @@ -326,7 +300,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo Op2 = &Op2plain; } ProjectedHermOp,LatticeFermion> Op2nopoly(pr,HermOp); - ImplicitlyRestartedLanczos > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,betastp2,MaxIt,MinRes2); + ImplicitlyRestartedLanczos > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,MaxIt,betastp2,MinRes2); src_coarse = 1.0; @@ -648,7 +622,7 @@ int main (int argc, char ** argv) { } // First round of Lanczos to get low mode basis - ImplicitlyRestartedLanczos IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,betastp1,MaxIt,MinRes1); + ImplicitlyRestartedLanczos IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,MaxIt,betastp1,MinRes1); int Nconv; char tag[1024]; diff --git a/tests/lanczos/Test_dwf_lanczos.cc b/tests/lanczos/Test_dwf_lanczos.cc index 1dd5dae3..b1e205cf 100644 --- a/tests/lanczos/Test_dwf_lanczos.cc 
+++ b/tests/lanczos/Test_dwf_lanczos.cc @@ -84,11 +84,12 @@ int main (int argc, char ** argv) std::vector Coeffs { 0.,-1.}; Polynomial PolyX(Coeffs); - Chebyshev Cheb(0.2,5.,11); -// ChebyshevLanczos Cheb(9.,1.,0.,20); -// Cheb.csv(std::cout); -// exit(-24); - ImplicitlyRestartedLanczos IRL(HermOp,Cheb,Nstop,Nk,Nm,resid,MaxIt); + Chebyshev Cheby(0.2,5.,11); + + FunctionHermOp OpCheby(Cheby,HermOp); + PlainHermOp Op (HermOp); + + ImplicitlyRestartedLanczos IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt); std::vector eval(Nm); diff --git a/tests/lanczos/Test_synthetic_lanczos.cc b/tests/lanczos/Test_synthetic_lanczos.cc index 32fd6f32..4be9ca31 100644 --- a/tests/lanczos/Test_synthetic_lanczos.cc +++ b/tests/lanczos/Test_synthetic_lanczos.cc @@ -119,12 +119,13 @@ int main (int argc, char ** argv) RealD beta = 0.1; RealD mu = 0.0; int order = 11; - ChebyshevLanczos Cheby(alpha,beta,mu,order); + Chebyshev Cheby(alpha,beta,order); std::ofstream file("cheby.dat"); Cheby.csv(file); - HermOpOperatorFunction X; DumbOperator HermOp(grid); + FunctionHermOp OpCheby(Cheby,HermOp); + PlainHermOp Op(HermOp); const int Nk = 40; const int Nm = 80; @@ -133,8 +134,9 @@ int main (int argc, char ** argv) int Nconv; RealD eresid = 1.0e-6; - ImplicitlyRestartedLanczos IRL(HermOp,X,Nk,Nk,Nm,eresid,Nit); - ImplicitlyRestartedLanczos ChebyIRL(HermOp,Cheby,Nk,Nk,Nm,eresid,Nit); + + ImplicitlyRestartedLanczos IRL(Op,Op,Nk,Nk,Nm,eresid,Nit); + ImplicitlyRestartedLanczos ChebyIRL(OpCheby,Op,Nk,Nk,Nm,eresid,Nit); LatticeComplex src(grid); gaussian(RNG,src); { diff --git a/tests/lanczos/Test_wilson_lanczos.cc b/tests/lanczos/Test_wilson_lanczos.cc index e8549234..eabc86d7 100644 --- a/tests/lanczos/Test_wilson_lanczos.cc +++ b/tests/lanczos/Test_wilson_lanczos.cc @@ -86,9 +86,12 @@ int main(int argc, char** argv) { std::vector Coeffs{0, 1.}; Polynomial PolyX(Coeffs); - Chebyshev Cheb(0.0, 10., 12); - ImplicitlyRestartedLanczos IRL(HermOp, PolyX, Nstop, Nk, Nm, - resid, MaxIt); + Chebyshev Cheby(0.0, 10., 12); + + FunctionHermOp OpCheby(Cheby,HermOp); + PlainHermOp Op (HermOp); + + ImplicitlyRestartedLanczos IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt); std::vector eval(Nm); FermionField src(FGrid); From 27936900e63c51aa6d52d2cb2cfdfa2936adbbd3 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 18 Oct 2017 13:08:09 +0100 Subject: [PATCH 06/63] Putting the FG verbosity in the Integrator level --- lib/qcd/hmc/integrators/Integrator_algorithm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/qcd/hmc/integrators/Integrator_algorithm.h b/lib/qcd/hmc/integrators/Integrator_algorithm.h index ecc125ef..13a37aeb 100644 --- a/lib/qcd/hmc/integrators/Integrator_algorithm.h +++ b/lib/qcd/hmc/integrators/Integrator_algorithm.h @@ -231,7 +231,7 @@ class ForceGradient : public Integrator Date: Wed, 25 Oct 2017 10:24:14 +0100 Subject: [PATCH 07/63] Solving again the MPI comm bug with FFTs --- lib/cartesian/Cartesian_base.h | 2 ++ lib/cartesian/Cartesian_full.h | 2 ++ lib/cartesian/Cartesian_red_black.h | 2 ++ lib/communicator/Communicator_base.h | 1 + lib/communicator/Communicator_mpi.cc | 6 ++++++ lib/communicator/Communicator_mpit.cc | 6 ++++++ lib/communicator/Communicator_none.cc | 2 ++ 7 files changed, 21 insertions(+) diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index 324772c5..6aa0e3c7 100644 --- a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -52,6 +52,8 @@ public: GridBase(const std::vector & processor_grid, const CartesianCommunicator &parent) : 
CartesianCommunicator(processor_grid,parent) {}; + virtual ~GridBase() = default; + // Physics Grid information. std::vector _simd_layout;// Which dimensions get relayed out over simd lanes. std::vector _fdimensions;// (full) Global dimensions of array prior to cb removal diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index a6a85ab7..c7ea68c9 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -81,6 +81,8 @@ public: Init(dimensions,simd_layout,processor_grid); } + virtual ~GridCartesian() = default; + void Init(const std::vector &dimensions, const std::vector &simd_layout, const std::vector &processor_grid) diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index f89cacc5..166c8491 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -133,6 +133,8 @@ public: { Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ; } + + virtual ~GridRedBlackCartesian() = default; #if 0 //////////////////////////////////////////////////////////// // Create redblack grid ;; deprecate these. Should not diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 8ff22dbd..22c9e4d0 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -155,6 +155,7 @@ class CartesianCommunicator { //////////////////////////////////////////////// CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent); CartesianCommunicator(const std::vector &pdimensions_in); + virtual ~CartesianCommunicator(); private: #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 678e4517..f1dad1e9 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -52,6 +52,12 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); ShmInitGeneric(); } + +CartesianCommunicator::~CartesianCommunicator(){ +  if (communicator && !MPI::Is_finalized()) +  MPI_Comm_free(&communicator); +} + void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index 5137c27b..2d257a44 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -53,6 +53,12 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { ShmInitGeneric(); } +CartesianCommunicator::~CartesianCommunicator(){ +  if (communicator && !MPI::Is_finalized()) +  MPI_Comm_free(&communicator); +} + + void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index e9d71a15..629a3e4a 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -56,6 +56,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) } } +CartesianCommunicator::~CartesianCommunicator(){} + void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSum(double &){} From 8a3aae98f6ffba03dcb85e1be23cd387a510e35d Mon Sep 17 00:00:00 2001 
From: Guido Cossu Date: Wed, 25 Oct 2017 10:34:49 +0100 Subject: [PATCH 08/63] Solving minor bug in compilation --- lib/communicator/Communicator_mpi.cc | 7 ++++--- lib/communicator/Communicator_mpit.cc | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index f1dad1e9..5a2dc4d0 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -53,9 +53,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { ShmInitGeneric(); } -CartesianCommunicator::~CartesianCommunicator(){ -  if (communicator && !MPI::Is_finalized()) -  MPI_Comm_free(&communicator); +CartesianCommunicator::~CartesianCommunicator() +{ + if (communicator && !MPI::Is_finalized()) + MPI_Comm_free(&communicator); } void CartesianCommunicator::GlobalSum(uint32_t &u){ diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index 2d257a44..15ee13fd 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -53,9 +53,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { ShmInitGeneric(); } -CartesianCommunicator::~CartesianCommunicator(){ -  if (communicator && !MPI::Is_finalized()) -  MPI_Comm_free(&communicator); +CartesianCommunicator::~CartesianCommunicator() +{ + if (communicator && !MPI::Is_finalized()) + MPI_Comm_free(&communicator); } From 28ba8a0f481f0451b5dc22691fe0ad35963af55a Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 25 Oct 2017 23:45:57 +0100 Subject: [PATCH 09/63] Force spacing more nicely --- lib/log/Log.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/log/Log.h b/lib/log/Log.h index 1b4732ab..ddff4c1d 100644 --- a/lib/log/Log.h +++ b/lib/log/Log.h @@ -135,7 +135,7 @@ public: GridTime now = log.StopWatch->Elapsed(); if ( log.timing_mode==1 ) log.StopWatch->Reset(); log.StopWatch->Start(); - stream << log.evidence()<< now << log.background() << " : " ; + stream << log.evidence()<< std::setw(6)< Date: Wed, 25 Oct 2017 23:46:33 +0100 Subject: [PATCH 10/63] Improvements for coarse grid compressed lanczos --- lib/algorithms/CoarsenedMatrix.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h index c2910151..8af8d7ac 100644 --- a/lib/algorithms/CoarsenedMatrix.h +++ b/lib/algorithms/CoarsenedMatrix.h @@ -103,29 +103,32 @@ namespace Grid { GridBase *CoarseGrid; GridBase *FineGrid; std::vector > subspace; + int checkerboard; - Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid) : - CoarseGrid(_CoarseGrid), + Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : + CoarseGrid(_CoarseGrid), FineGrid(_FineGrid), - subspace(nbasis,_FineGrid) + subspace(nbasis,_FineGrid), + checkerboard(_checkerboard) { }; void Orthogonalise(void){ CoarseScalar InnerProd(CoarseGrid); + std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"< pokey(CoarseGrid); - - for(int i=0;ioSites();ss++){ + parallel_for(int ss=0;ssoSites();ss++){ eProj._odata[ss](i)=CComplex(1.0); } eProj=eProj - iProj; @@ -137,6 +140,7 @@ namespace Grid { blockProject(CoarseVec,FineVec,subspace); } void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){ + FineVec.checkerboard = subspace[0].checkerboard; blockPromote(CoarseVec,FineVec,subspace); } void CreateSubspaceRandom(GridParallelRNG &RNG){ @@ -147,6 +151,7 @@ namespace Grid { Orthogonalise(); } + /* virtual void 
CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase &hermop,int nn=nbasis) { // Run a Lanczos with sloppy convergence @@ -195,7 +200,7 @@ namespace Grid { std::cout << GridLogMessage <<"subspace["< &hermop,int nn=nbasis) { RealD scale; From d83868fdbbc6a3e9f67c966a190d517a2fb7f9f7 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 25 Oct 2017 23:47:10 +0100 Subject: [PATCH 11/63] Identity linear op added -- useful in circumstances where a linear op may or may not be needed. Supply a trivial one if not needed --- lib/algorithms/LinearOperator.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index 0d32cc15..2a68a7b9 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -346,6 +346,13 @@ namespace Grid { virtual void operator() (const Field &in, Field &out) = 0; }; + template class IdentityLinearFunction : public LinearFunction { + public: + void operator() (const Field &in, Field &out){ + out = in; + }; + }; + ///////////////////////////////////////////////////////////// // Base classes for Multishift solvers for operators From f6c3f6bf2d6ff210e25844b64f0d09fe5d074212 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 25 Oct 2017 23:47:59 +0100 Subject: [PATCH 12/63] XML serialisation of parms and initialise from parms object --- lib/algorithms/approx/Chebyshev.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/algorithms/approx/Chebyshev.h b/lib/algorithms/approx/Chebyshev.h index 7a6e9a9b..b34fac7f 100644 --- a/lib/algorithms/approx/Chebyshev.h +++ b/lib/algorithms/approx/Chebyshev.h @@ -34,6 +34,12 @@ Author: Christoph Lehner namespace Grid { +struct ChebyParams : Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams, + RealD, alpha, + RealD, beta, + int, Npoly); +}; //////////////////////////////////////////////////////////////////////////////////////////// // Generic Chebyshev approximations @@ -67,6 +73,7 @@ namespace Grid { }; Chebyshev(){}; + Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);}; Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);}; Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);}; From a479325349d5eed9351abe5adf267311d8b6d34c Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 25 Oct 2017 23:48:47 +0100 Subject: [PATCH 13/63] Rewrite of local coherence lanczos --- .../Test_dwf_compressed_lanczos_reorg.cc | 518 ++++++++++++++++++ 1 file changed, 518 insertions(+) create mode 100644 tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc new file mode 100644 index 00000000..a0691116 --- /dev/null +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc @@ -0,0 +1,518 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_compressed_lanczos_reorg.cc + + Copyright (C) 2017 + +Author: Leans heavily on Christoph Lehner's code +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +/* + * Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features + * in Grid that were intended to be used to support blocked Aggregates, from + */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +struct LanczosParams : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, + ChebyParams, Cheby,/*Chebyshev*/ + int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/ + int, Nk, /*Vecs in Lanczos seek converge*/ + int, Nm, /*Total vecs in Lanczos include restart*/ + RealD, resid, /*residual*/ + int, MaxIt, + RealD, betastp, /* ? */ + int, MinRes); // Must restart +}; + +struct CompressedLanczosParams : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(CompressedLanczosParams, + LanczosParams, FineParams, + LanczosParams, CoarseParams, + ChebyParams, Smoother, + std::vector, blockSize, + std::string, config, + std::vector < std::complex >, omega, + RealD, mass, + RealD, M5 + ); +}; + +// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function +template +class ProjectedHermOp : public LinearFunction > > { +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + LinearOperatorBase &_Linop; + Aggregation &_Aggregate; + + ProjectedHermOp(LinearOperatorBase& linop, Aggregation &aggregate) : + _Linop(linop), + _Aggregate(aggregate) { }; + + void operator()(const CoarseField& in, CoarseField& out) { + + GridBase *FineGrid = _Aggregate.FineGrid; + FineField fin(FineGrid); + FineField fout(FineGrid); + + _Aggregate.PromoteFromSubspace(in,fin); std::cout< +class ProjectedFunctionHermOp : public LinearFunction > > { +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + + OperatorFunction & _poly; + LinearOperatorBase &_Linop; + Aggregation &_Aggregate; + + ProjectedFunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop, + Aggregation &aggregate) : + _poly(poly), + _Linop(linop), + _Aggregate(aggregate) { }; + + void operator()(const CoarseField& in, CoarseField& out) { + + GridBase *FineGrid = _Aggregate.FineGrid; + + FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard; + FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard; + + _Aggregate.PromoteFromSubspace(in,fin); std::cout< +class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester > > +{ + public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + LinearFunction & _Poly; + OperatorFunction & _smoother; + LinearOperatorBase 
&_Linop; + Aggregation &_Aggregate; + + ImplicitlyRestartedLanczosSmoothedTester(LinearFunction &Poly, + OperatorFunction &smoother, + LinearOperatorBase &Linop, + Aggregation &Aggregate) + : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly) { }; + + int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) + { + CoarseField v(B); + RealD eval_poly = eval; + // Apply operator + _Poly(B,v); + + RealD vnum = real(innerProduct(B,v)); // HermOp. + RealD vden = norm2(B); + RealD vv0 = norm2(v); + eval = vnum/vden; + v -= eval*B; + + RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0); + + std::cout.precision(13); + std::cout< +class CoarseFineIRL +{ +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice CoarseField; + typedef Lattice FineField; + +private: + GridBase *_CoarseGrid; + GridBase *_FineGrid; + int _checkerboard; + LinearOperatorBase & _FineOp; + + // FIXME replace Aggregation with vector of fine; the code reuse is too small for + // the hassle and complexity of cross coupling. + Aggregation _Aggregate; + std::vector evals_fine; + std::vector evals_coarse; + std::vector evec_coarse; +public: + CoarseFineIRL(GridBase *FineGrid, + GridBase *CoarseGrid, + LinearOperatorBase &FineOp, + int checkerboard) : + _CoarseGrid(CoarseGrid), + _FineGrid(FineGrid), + _Aggregate(CoarseGrid,FineGrid,checkerboard), + _FineOp(FineOp), + _checkerboard(checkerboard) + { + evals_fine.resize(0); + evals_coarse.resize(0); + }; + void Orthogonalise(void ) { _Aggregate.Orthogonalise(); } + + template static RealD normalise(T& v) + { + RealD nn = norm2(v); + nn = ::sqrt(nn); + v = v * (1.0/nn); + return nn; + } + + void testFine(void) + { + int Nk = nbasis; + _Aggregate.subspace.resize(Nk,_FineGrid); + _Aggregate.subspace[0]=1.0; + _Aggregate.subspace[0].checkerboard=_checkerboard; + normalise(_Aggregate.subspace[0]); + PlainHermOp Op(_FineOp); + for(int k=1;k Cheby(cheby_parms); + FunctionHermOp ChebyOp(Cheby,_FineOp); + PlainHermOp Op(_FineOp); + + evals_fine.resize(Nm); + _Aggregate.subspace.resize(Nm,_FineGrid); + + ImplicitlyRestartedLanczos IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); + + FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; + + int Nconv; + IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false,0); + + // Shrink down to number saved + assert(Nstop>=nbasis); + assert(Nconv>=nbasis); + evals_fine.resize(nbasis); + _Aggregate.subspace.resize(nbasis,_FineGrid); + } + void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth, + int Nstop, int Nk, int Nm,RealD resid, + RealD MaxIt, RealD betastp, int MinRes) + { + Chebyshev Cheby(cheby_op); + ProjectedHermOp Op(_FineOp,_Aggregate); + ProjectedFunctionHermOp ChebyOp (Cheby,_FineOp,_Aggregate); + ////////////////////////////////////////////////////////////////////////////////////////////////// + // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL + ////////////////////////////////////////////////////////////////////////////////////////////////// + + Chebyshev ChebySmooth(cheby_smooth); + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate); + + + evals_coarse.resize(Nm); + evec_coarse.resize(Nm,_CoarseGrid); + + CoarseField src(_CoarseGrid); src=1.0; + + ImplicitlyRestartedLanczos IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); + int Nconv=0; + 
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false,1); + assert(Nconv>=Nstop); + + for (int i=0;i blockSize = Params.blockSize; + + // Grids + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector fineLatt = GridDefaultLatt(); + int dims=fineLatt.size(); + assert(blockSize.size()==dims+1); + std::vector coarseLatt(dims); + std::vector coarseLatt5d ; + + for (int d=0;d HermOp(Ddwf); + + // Eigenvector storage + LanczosParams fine =Params.FineParams; + LanczosParams coarse=Params.CoarseParams; + + const int Ns1 = fine.Nstop; const int Ns2 = coarse.Nstop; + const int Nk1 = fine.Nk; const int Nk2 = coarse.Nk; + const int Nm1 = fine.Nm; const int Nm2 = coarse.Nm; + + std::cout << GridLogMessage << "Keep " << fine.Nstop << " fine vectors" << std::endl; + std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl; + assert(Nm2 >= Nm1); + + const int nbasis= 60; + assert(nbasis==Ns1); + CoarseFineIRL IRL(FrbGrid,CoarseGrid5rb,HermOp,Odd); + std::cout << GridLogMessage << "Constructed CoarseFine IRL" << std::endl; + + int do_fine = 1; + int do_coarse = 0; + int do_smooth = 0; + if ( do_fine ) { + std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "< Date: Wed, 25 Oct 2017 23:49:23 +0100 Subject: [PATCH 14/63] 64 bit safe offsets --- lib/parallelIO/BinaryIO.h | 94 +++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index d14f3fe2..a2abc9be 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -261,7 +261,7 @@ class BinaryIO { GridBase *grid, std::vector &iodata, std::string file, - int offset, + Integer offset, const std::string &format, int control, uint32_t &nersc_csum, uint32_t &scidac_csuma, @@ -367,7 +367,7 @@ class BinaryIO { assert(0); #endif } else { - std::cout << GridLogMessage << "C++ read I/O " << file << " : " + std::cout << GridLogMessage << "C++ read I/O " << file << " : " << iodata.size() * sizeof(fobj) << " bytes" << std::endl; std::ifstream fin; fin.open(file, std::ios::binary | std::ios::in); @@ -444,48 +444,56 @@ class BinaryIO { assert(0); #endif } else { + + std::cout << GridLogMessage << "C++ write I/O " << file << " : " + << iodata.size() * sizeof(fobj) << " bytes" << std::endl; std::ofstream fout; - fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); - try { - fout.open(file,std::ios::binary|std::ios::out|std::ios::in); - } catch (const std::fstream::failure& exc) { - std::cout << GridLogError << "Error in opening the file " << file << " for output" < &Umu, std::string file, munger munge, - int offset, + Integer offset, const std::string &format, uint32_t &nersc_csum, uint32_t &scidac_csuma, @@ -552,7 +560,7 @@ class BinaryIO { static inline void writeLatticeObject(Lattice &Umu, std::string file, munger munge, - int offset, + Integer offset, const std::string &format, uint32_t &nersc_csum, uint32_t &scidac_csuma, @@ -589,7 +597,7 @@ class BinaryIO { static inline void readRNG(GridSerialRNG &serial, GridParallelRNG ¶llel, std::string file, - int offset, + Integer offset, uint32_t &nersc_csum, uint32_t &scidac_csuma, uint32_t &scidac_csumb) @@ -651,7 +659,7 @@ 
class BinaryIO { static inline void writeRNG(GridSerialRNG &serial, GridParallelRNG ¶llel, std::string file, - int offset, + Integer offset, uint32_t &nersc_csum, uint32_t &scidac_csuma, uint32_t &scidac_csumb) From 66295b99aada692f68c6547ce7d435e8d7df9e66 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 25 Oct 2017 23:50:05 +0100 Subject: [PATCH 15/63] Bit less verbose SciDAC IO --- lib/parallelIO/IldgIO.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index ba71153d..1f2b7c90 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -147,7 +147,7 @@ namespace QCD { _scidacRecord = sr; - std::cout << GridLogMessage << "Build SciDAC datatype " <_gsites; createLimeRecordHeader(record_name, 0, 0, PayloadSize); - // std::cout << "W sizeof(sobj)" <_gsites< xmlc(nbytes+1,'\0'); limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR); - std::cout << GridLogMessage<< "Non binary record :" < Date: Wed, 25 Oct 2017 23:50:37 +0100 Subject: [PATCH 16/63] Better error messaging --- lib/serialisation/XmlIO.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/serialisation/XmlIO.cc b/lib/serialisation/XmlIO.cc index a132a2f0..c0c45adc 100644 --- a/lib/serialisation/XmlIO.cc +++ b/lib/serialisation/XmlIO.cc @@ -68,10 +68,10 @@ std::string XmlWriter::XmlString(void) XmlReader::XmlReader(const char *xmlstring,string toplev) : fileName_("") { pugi::xml_parse_result result; - result = doc_.load_string(xmlstring); + result = doc_.load_file(xmlstring); if ( !result ) { - cerr << "XML error description: " << result.description() << "\n"; - cerr << "XML error offset : " << result.offset << "\n"; + cerr << "XML error description: char * " << result.description() << " "<< xmlstring << "\n"; + cerr << "XML error offset : char * " << result.offset << " "< Date: Wed, 25 Oct 2017 23:51:18 +0100 Subject: [PATCH 17/63] Red black friendly coarsening --- lib/lattice/Lattice_transfer.h | 54 ++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 713a8788..48688e43 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -109,8 +109,8 @@ inline void blockProject(Lattice > &coarseData, coarseData=zero; - // Loop with a cache friendly loop ordering - for(int sf=0;sfoSites();sf++){ + // Loop over coars parallel, and then loop over fine associated with coarse. + parallel_for(int sf=0;sfoSites();sf++){ int sc; std::vector coor_c(_ndimension); @@ -119,8 +119,9 @@ inline void blockProject(Lattice > &coarseData, for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); +PARALLEL_CRITICAL for(int i=0;i &fineZ, GridBase * coarse= coarseA._grid; fineZ.checkerboard=fineX.checkerboard; + assert(fineX.checkerboard==fineY.checkerboard); subdivides(coarse,fine); // require they map conformable(fineX,fineY); conformable(fineX,fineZ); @@ -180,9 +182,10 @@ template GridBase *coarse(CoarseInner._grid); GridBase *fine (fineX._grid); - Lattice fine_inner(fine); + Lattice fine_inner(fine); fine_inner.checkerboard = fineX.checkerboard; Lattice coarse_inner(coarse); + // Precision promotion? 
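+  // What follows computes, for each coarse site (block) b,
+  //   CoarseInner(b) = sum_{x in b} localInnerProduct(fineX(x),fineY(x)),
+  // i.e. the per-block inner product consumed by blockNormalise and the
+  // block Gram-Schmidt.  Propagating fineX.checkerboard onto fine_inner
+  // (added above) is what makes this safe on red-black half grids.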
fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); parallel_for(int ss=0;ssoSites();ss++){ @@ -193,7 +196,7 @@ template inline void blockNormalise(Lattice &ip,Lattice &fineX) { GridBase *coarse = ip._grid; - Lattice zz(fineX._grid); zz=zero; + Lattice zz(fineX._grid); zz=zero; zz.checkerboard=fineX.checkerboard; blockInnerProduct(ip,fineX,fineX); ip = pow(ip,-0.5); blockZAXPY(fineX,ip,fineX,zz); @@ -216,19 +219,25 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } + // Turn this around to loop threaded over sc and interior loop + // over sf would thread better coarseData=zero; - for(int sf=0;sfoSites();sf++){ - + parallel_region { + int sc; std::vector coor_c(_ndimension); std::vector coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf]; + parallel_for_internal(int sf=0;sfoSites();sf++){ + + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + +PARALLEL_CRITICAL + coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf]; + } } return; } @@ -238,7 +247,7 @@ inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice zz(fine); + Lattice zz(fine); zz.checkerboard = unpicked.checkerboard; Lattice > fcoor(fine); zz = zero; @@ -303,20 +312,21 @@ inline void blockPromote(const Lattice > &coarseData, } // Loop with a cache friendly loop ordering - for(int sf=0;sfoSites();sf++){ - + parallel_region { int sc; std::vector coor_c(_ndimension); std::vector coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - for(int i=0;ioSites();sf++){ + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + + for(int i=0;i Date: Wed, 25 Oct 2017 23:52:47 +0100 Subject: [PATCH 18/63] Use existing functionality where possible --- tests/lanczos/FieldBasisVector.h | 81 ++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 tests/lanczos/FieldBasisVector.h diff --git a/tests/lanczos/FieldBasisVector.h b/tests/lanczos/FieldBasisVector.h new file mode 100644 index 00000000..9a21aa46 --- /dev/null +++ b/tests/lanczos/FieldBasisVector.h @@ -0,0 +1,81 @@ +namespace Grid { + +template +class BasisFieldVector { + public: + int _Nm; + + typedef typename Field::scalar_type Coeff_t; + typedef typename Field::vector_type vCoeff_t; + typedef typename Field::vector_object vobj; + typedef typename vobj::scalar_object sobj; + + std::vector _v; // _Nfull vectors + + void report(int n,GridBase* value) { + + std::cout << GridLogMessage << "BasisFieldVector allocated:\n"; + std::cout << GridLogMessage << " Delta N = " << n << "\n"; + std::cout << GridLogMessage << " Size of full vectors (size) = " << + ((double)n*sizeof(vobj)*value->oSites() / 1024./1024./1024.) 
<< " GB\n"; + std::cout << GridLogMessage << " Size = " << _v.size() << " Capacity = " << _v.capacity() << std::endl; + + value->Barrier(); + +#ifdef __linux + if (value->IsBoss()) { + system("cat /proc/meminfo"); + } +#endif + + value->Barrier(); + + } + + BasisFieldVector(int Nm,GridBase* value) : _Nm(Nm), _v(Nm,value) { + report(Nm,value); + } + + ~BasisFieldVector() { + } + + Field& operator[](int i) { + return _v[i]; + } + + void orthogonalize(Field& w, int k) { + basisOrthogonalize(_v,w,k); + } + + void rotate(Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) { + basisRotate(_v,Qt,j0,j1,k0,k1,Nm); + } + + size_t size() const { + return _Nm; + } + + void resize(int n) { + if (n > _Nm) + _v.reserve(n); + + _v.resize(n,_v[0]._grid); + + if (n < _Nm) + _v.shrink_to_fit(); + + report(n - _Nm,_v[0]._grid); + + _Nm = n; + } + + void sortInPlace(std::vector& sort_vals, bool reverse) { + basisSortInPlace(_v,sort_vals,reverse); + } + + void deflate(const std::vector& eval,const Field& src_orig,Field& result) { + basisDeflate(_v,eval,src_orig,result); + } + + }; +} From e4d461cb03ee3b039345c3c4ec29704dec5c8d94 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 25 Oct 2017 23:53:19 +0100 Subject: [PATCH 19/63] Messagign --- tests/lanczos/Test_dwf_compressed_lanczos.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/lanczos/Test_dwf_compressed_lanczos.cc b/tests/lanczos/Test_dwf_compressed_lanczos.cc index 10d6c3ae..a6eb95e9 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos.cc @@ -26,9 +26,9 @@ // The following are now decoupled from the Lanczos and deal with grids. // Safe to replace functionality ///////////////////////////////////////////////////////////////////////////// -#include -#include -#include +#include "BlockedGrid.h" +#include "FieldBasisVector.h" +#include "BlockProjector.h" #include "FieldVectorIO.h" #include "Params.h" @@ -431,6 +431,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo auto result = src_orig; // undeflated solve + std::cout << GridLogMessage << " Undeflated solve "<IsBoss()) @@ -438,6 +439,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo // CG.ResHistory.clear(); // deflated solve with all eigenvectors + std::cout << GridLogMessage << " Deflated solve with all evectors"<& pr,RealD alpha2,RealD beta,int Npo // CG.ResHistory.clear(); // deflated solve with non-blocked eigenvectors + std::cout << GridLogMessage << " Deflated solve with non-blocked evectors"<& pr,RealD alpha2,RealD beta,int Npo // CG.ResHistory.clear(); // deflated solve with all eigenvectors and original eigenvalues from proj + std::cout << GridLogMessage << " Deflated solve with all eigenvectors and original eigenvalues from proj"< Date: Wed, 25 Oct 2017 23:53:44 +0100 Subject: [PATCH 20/63] Faster converge time --- tests/solver/Test_dwf_mrhs_cg.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/solver/Test_dwf_mrhs_cg.cc b/tests/solver/Test_dwf_mrhs_cg.cc index d9215db2..079fa85a 100644 --- a/tests/solver/Test_dwf_mrhs_cg.cc +++ b/tests/solver/Test_dwf_mrhs_cg.cc @@ -190,7 +190,7 @@ int main (int argc, char ** argv) MdagMLinearOperator HermOp(Ddwf); MdagMLinearOperator HermOpCk(Dchk); - ConjugateGradient CG((1.0e-8/(me+1)),10000); + ConjugateGradient CG((1.0e-5/(me+1)),10000); s_res = zero; CG(HermOp,s_src,s_res); From d577211cc376303d88355df5bb101ff8aaf6f9ab Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 25 Oct 2017 
23:57:54 +0100 Subject: [PATCH 21/63] Relax stoppign condition --- tests/solver/Test_dwf_mrhs_cg_mpi.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index 90969b85..fbc6dd32 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -113,7 +113,7 @@ int main (int argc, char ** argv) MdagMLinearOperator HermOp(Ddwf); MdagMLinearOperator HermOpCk(Dchk); - ConjugateGradient CG((1.0e-8/(me+1)),10000); + ConjugateGradient CG((1.0e-5/(me+1)),10000); s_res = zero; CG(HermOp,s_src,s_res); From e9be293444039051630aca103ae861b51cf242a5 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 26 Oct 2017 01:59:30 +0100 Subject: [PATCH 22/63] Better messaging --- lib/parallelIO/BinaryIO.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index a2abc9be..b40a75af 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -356,7 +356,7 @@ class BinaryIO { if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { #ifdef USE_MPI_IO - std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl; + std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl; ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0); ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); @@ -367,7 +367,7 @@ class BinaryIO { assert(0); #endif } else { - std::cout << GridLogMessage << "C++ read I/O " << file << " : " + std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : " << iodata.size() * sizeof(fobj) << " bytes" << std::endl; std::ifstream fin; fin.open(file, std::ios::binary | std::ios::in); @@ -413,9 +413,9 @@ class BinaryIO { timer.Start(); if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { #ifdef USE_MPI_IO - std::cout << GridLogMessage << "MPI write I/O " << file << std::endl; + std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl; ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh); - std::cout << GridLogMessage << "Checking for errors" << std::endl; + // std::cout << GridLogMessage << "Checking for errors" << std::endl; if (ierr != MPI_SUCCESS) { char error_string[BUFSIZ]; @@ -445,7 +445,7 @@ class BinaryIO { #endif } else { - std::cout << GridLogMessage << "C++ write I/O " << file << " : " + std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : " << iodata.size() * sizeof(fobj) << " bytes" << std::endl; std::ofstream fout; From ccd20df8276fa1951f7d6489bce95c3a65de57eb Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 26 Oct 2017 01:59:59 +0100 Subject: [PATCH 23/63] Better IRL interface --- tests/lanczos/BlockProjector.h | 143 +++++++ tests/lanczos/BlockedGrid.h | 401 ++++++++++++++++++ tests/lanczos/Test_dwf_compressed_lanczos.cc | 4 +- .../Test_dwf_compressed_lanczos_reorg.cc | 21 +- 4 files changed, 557 insertions(+), 12 deletions(-) create mode 100644 tests/lanczos/BlockProjector.h create mode 100644 tests/lanczos/BlockedGrid.h diff --git a/tests/lanczos/BlockProjector.h b/tests/lanczos/BlockProjector.h new file mode 100644 index 00000000..6becaa66 --- /dev/null +++ b/tests/lanczos/BlockProjector.h @@ -0,0 +1,143 @@ +namespace Grid { + +/* + BlockProjector + + If 
_HP_BLOCK_PROJECTORS_ is defined, we assume that _evec is a basis that is not + fully orthonormalized (to the precision of the coarse field) and we allow for higher-precision + coarse field than basis field. + +*/ +//#define _HP_BLOCK_PROJECTORS_ + +template +class BlockProjector { +public: + + BasisFieldVector& _evec; + BlockedGrid& _bgrid; + + BlockProjector(BasisFieldVector& evec, BlockedGrid& bgrid) : _evec(evec), _bgrid(bgrid) { + } + + void createOrthonormalBasis(RealD thres = 0.0) { + + GridStopWatch sw; + sw.Start(); + + int cnt = 0; + +#pragma omp parallel shared(cnt) + { + int lcnt = 0; + +#pragma omp for + for (int b=0;b<_bgrid._o_blocks;b++) { + + for (int i=0;i<_evec._Nm;i++) { + + auto nrm0 = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]); + + // |i> -= |j> + for (int j=0;j + void coarseToFine(const CoarseField& in, Field& out) { + + out = zero; + out.checkerboard = _evec._v[0].checkerboard; + + int Nbasis = sizeof(in._odata[0]._internal._internal) / sizeof(in._odata[0]._internal._internal[0]); + assert(Nbasis == _evec._Nm); + +#pragma omp parallel for + for (int b=0;b<_bgrid._o_blocks;b++) { + for (int j=0;j<_evec._Nm;j++) { + _bgrid.block_caxpy(b,out,in._odata[b]._internal._internal[j],_evec._v[j],out); + } + } + + } + + template + void fineToCoarse(const Field& in, CoarseField& out) { + + out = zero; + + int Nbasis = sizeof(out._odata[0]._internal._internal) / sizeof(out._odata[0]._internal._internal[0]); + assert(Nbasis == _evec._Nm); + + + Field tmp(_bgrid._grid); + tmp = in; + +#pragma omp parallel for + for (int b=0;b<_bgrid._o_blocks;b++) { + for (int j=0;j<_evec._Nm;j++) { + // |rhs> -= |j> + auto c = _bgrid.block_sp(b,_evec._v[j],tmp); + _bgrid.block_caxpy(b,tmp,-c,_evec._v[j],tmp); // may make this more numerically stable + out._odata[b]._internal._internal[j] = c; + } + } + + } + + template + void deflateFine(BasisFieldVector& _coef,const std::vector& eval,int N,const Field& src_orig,Field& result) { + result = zero; + for (int i=0;i + void deflateCoarse(BasisFieldVector& _coef,const std::vector& eval,int N,const Field& src_orig,Field& result) { + CoarseField src_coarse(_coef._v[0]._grid); + CoarseField result_coarse = src_coarse; + result_coarse = zero; + fineToCoarse(src_orig,src_coarse); + for (int i=0;i + void deflate(BasisFieldVector& _coef,const std::vector& eval,int N,const Field& src_orig,Field& result) { + // Deflation on coarse Grid is much faster, so use it by default. Deflation on fine Grid is kept for legacy reasons for now. 
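+    // Both paths implement the same low-mode guess,
+    //   result ~= sum_i (1/eval[i]) |v_i> <v_i|src_orig>,
+    // deflateCoarse simply evaluates the projections and the sum in the blocked
+    // coarse basis, so the per-eigenvector work scales with the coarse volume.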
+ deflateCoarse(_coef,eval,N,src_orig,result); + } + +}; +} diff --git a/tests/lanczos/BlockedGrid.h b/tests/lanczos/BlockedGrid.h new file mode 100644 index 00000000..821272de --- /dev/null +++ b/tests/lanczos/BlockedGrid.h @@ -0,0 +1,401 @@ +namespace Grid { + +template +class BlockedGrid { +public: + GridBase* _grid; + typedef typename Field::scalar_type Coeff_t; + typedef typename Field::vector_type vCoeff_t; + + std::vector _bs; // block size + std::vector _nb; // number of blocks + std::vector _l; // local dimensions irrespective of cb + std::vector _l_cb; // local dimensions of checkerboarded vector + std::vector _l_cb_o; // local dimensions of inner checkerboarded vector + std::vector _bs_cb; // block size in checkerboarded vector + std::vector _nb_o; // number of blocks of simd o-sites + + int _nd, _blocks, _cf_size, _cf_block_size, _cf_o_block_size, _o_blocks, _block_sites; + + BlockedGrid(GridBase* grid, const std::vector& block_size) : + _grid(grid), _bs(block_size), _nd((int)_bs.size()), + _nb(block_size), _l(block_size), _l_cb(block_size), _nb_o(block_size), + _l_cb_o(block_size), _bs_cb(block_size) { + + _blocks = 1; + _o_blocks = 1; + _l = grid->FullDimensions(); + _l_cb = grid->LocalDimensions(); + _l_cb_o = grid->_rdimensions; + + _cf_size = 1; + _block_sites = 1; + for (int i=0;i<_nd;i++) { + _l[i] /= grid->_processors[i]; + + assert(!(_l[i] % _bs[i])); // lattice must accommodate choice of blocksize + + int r = _l[i] / _l_cb[i]; + assert(!(_bs[i] % r)); // checkerboarding must accommodate choice of blocksize + _bs_cb[i] = _bs[i] / r; + _block_sites *= _bs_cb[i]; + _nb[i] = _l[i] / _bs[i]; + _nb_o[i] = _nb[i] / _grid->_simd_layout[i]; + if (_nb[i] % _grid->_simd_layout[i]) { // simd must accommodate choice of blocksize + std::cout << GridLogMessage << "Problem: _nb[" << i << "] = " << _nb[i] << " _grid->_simd_layout[" << i << "] = " << _grid->_simd_layout[i] << std::endl; + assert(0); + } + _blocks *= _nb[i]; + _o_blocks *= _nb_o[i]; + _cf_size *= _l[i]; + } + + _cf_size *= 12 / 2; + _cf_block_size = _cf_size / _blocks; + _cf_o_block_size = _cf_size / _o_blocks; + + std::cout << GridLogMessage << "BlockedGrid:" << std::endl; + std::cout << GridLogMessage << " _l = " << _l << std::endl; + std::cout << GridLogMessage << " _l_cb = " << _l_cb << std::endl; + std::cout << GridLogMessage << " _l_cb_o = " << _l_cb_o << std::endl; + std::cout << GridLogMessage << " _bs = " << _bs << std::endl; + std::cout << GridLogMessage << " _bs_cb = " << _bs_cb << std::endl; + + std::cout << GridLogMessage << " _nb = " << _nb << std::endl; + std::cout << GridLogMessage << " _nb_o = " << _nb_o << std::endl; + std::cout << GridLogMessage << " _blocks = " << _blocks << std::endl; + std::cout << GridLogMessage << " _o_blocks = " << _o_blocks << std::endl; + std::cout << GridLogMessage << " sizeof(vCoeff_t) = " << sizeof(vCoeff_t) << std::endl; + std::cout << GridLogMessage << " _cf_size = " << _cf_size << std::endl; + std::cout << GridLogMessage << " _cf_block_size = " << _cf_block_size << std::endl; + std::cout << GridLogMessage << " _block_sites = " << _block_sites << std::endl; + std::cout << GridLogMessage << " _grid->oSites() = " << _grid->oSites() << std::endl; + + // _grid->Barrier(); + //abort(); + } + + void block_to_coor(int b, std::vector& x0) { + + std::vector bcoor; + bcoor.resize(_nd); + x0.resize(_nd); + assert(b < _o_blocks); + Lexicographic::CoorFromIndex(bcoor,b,_nb_o); + int i; + + for (i=0;i<_nd;i++) { + x0[i] = bcoor[i]*_bs_cb[i]; + } + + //std::cout << GridLogMessage << 
"Map block b -> " << x0 << std::endl; + + } + + void block_site_to_o_coor(const std::vector& x0, std::vector& coor, int i) { + Lexicographic::CoorFromIndex(coor,i,_bs_cb); + for (int j=0;j<_nd;j++) + coor[j] += x0[j]; + } + + int block_site_to_o_site(const std::vector& x0, int i) { + std::vector coor; coor.resize(_nd); + block_site_to_o_coor(x0,coor,i); + Lexicographic::IndexFromCoor(coor,i,_l_cb_o); + return i; + } + + vCoeff_t block_sp(int b, const Field& x, const Field& y) { + + std::vector x0; + block_to_coor(b,x0); + + vCoeff_t ret = 0.0; + for (int i=0;i<_block_sites;i++) { // only odd sites + int ss = block_site_to_o_site(x0,i); + ret += TensorRemove(innerProduct(x._odata[ss],y._odata[ss])); + } + + return ret; + + } + + vCoeff_t block_sp(int b, const Field& x, const std::vector< ComplexD >& y) { + + std::vector x0; + block_to_coor(b,x0); + + constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t); + int lsize = _cf_o_block_size / _block_sites; + + std::vector< ComplexD > ret(nsimd); + for (int i=0;i + void vcaxpy(iScalar& r,const vCoeff_t& a,const iScalar& x,const iScalar& y) { + vcaxpy(r._internal,a,x._internal,y._internal); + } + + template + void vcaxpy(iVector& r,const vCoeff_t& a,const iVector& x,const iVector& y) { + for (int i=0;i x0; + block_to_coor(b,x0); + + for (int i=0;i<_block_sites;i++) { // only odd sites + int ss = block_site_to_o_site(x0,i); + vcaxpy(ret._odata[ss],a,x._odata[ss],y._odata[ss]); + } + + } + + void block_caxpy(int b, std::vector< ComplexD >& ret, const vCoeff_t& a, const Field& x, const std::vector< ComplexD >& y) { + std::vector x0; + block_to_coor(b,x0); + + constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t); + int lsize = _cf_o_block_size / _block_sites; + + for (int i=0;i<_block_sites;i++) { // only odd sites + int ss = block_site_to_o_site(x0,i); + + int n = lsize / nsimd; + for (int l=0;l& x) { + std::vector x0; + block_to_coor(b,x0); + + int lsize = _cf_o_block_size / _block_sites; + + for (int i=0;i<_block_sites;i++) { // only odd sites + int ss = block_site_to_o_site(x0,i); + + for (int l=0;l& x) { + std::vector x0; + block_to_coor(b,x0); + + int lsize = _cf_o_block_size / _block_sites; + + for (int i=0;i<_block_sites;i++) { // only odd sites + int ss = block_site_to_o_site(x0,i); + + for (int l=0;l + void vcscale(iScalar& r,const vCoeff_t& a,const iScalar& x) { + vcscale(r._internal,a,x._internal); + } + + template + void vcscale(iVector& r,const vCoeff_t& a,const iVector& x) { + for (int i=0;i x0; + block_to_coor(b,x0); + + for (int i=0;i<_block_sites;i++) { // only odd sites + int ss = block_site_to_o_site(x0,i); + vcscale(ret._odata[ss],a,ret._odata[ss]); + } + } + + void getCanonicalBlockOffset(int cb, std::vector& x0) { + const int ndim = 5; + assert(_nb.size() == ndim); + std::vector _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] }; + std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] }; + x0.resize(ndim); + + assert(cb >= 0); + assert(cb < _nbc[0]*_nbc[1]*_nbc[2]*_nbc[3]*_nbc[4]); + + Lexicographic::CoorFromIndex(x0,cb,_nbc); + int i; + + for (i=0;i& buf) { + std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] }; + std::vector ldim = v._grid->LocalDimensions(); + std::vector cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] }; + const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4]; + // take canonical block cb of v and put it in canonical ordering in buf + std::vector cx0; + getCanonicalBlockOffset(cb,cx0); + +#pragma omp parallel + { + std::vector co0,cl0; + co0=cx0; cl0=cx0; + +#pragma 
omp for + for (int i=0;i<_nbsc;i++) { + Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo + for (int j=0;j<(int)_bsc.size();j++) + cl0[j] = cx0[j] + co0[j]; + + std::vector l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] }; + int oi = v._grid->oIndex(l0); + int ii = v._grid->iIndex(l0); + int lti = i; + + //if (cb < 2 && i<2) + // std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl; + + for (int s=0;s<4;s++) + for (int c=0;c<3;c++) { + Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii]; + int ti = 12*lti + 3*s + c; + ld = Coeff_t(buf[2*ti+0], buf[2*ti+1]); + } + } + } + } + + void peekBlockOfVectorCanonical(int cb,const Field& v,std::vector& buf) { + std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] }; + std::vector ldim = v._grid->LocalDimensions(); + std::vector cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] }; + const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4]; + // take canonical block cb of v and put it in canonical ordering in buf + std::vector cx0; + getCanonicalBlockOffset(cb,cx0); + + buf.resize(_cf_block_size * 2); + +#pragma omp parallel + { + std::vector co0,cl0; + co0=cx0; cl0=cx0; + +#pragma omp for + for (int i=0;i<_nbsc;i++) { + Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo + for (int j=0;j<(int)_bsc.size();j++) + cl0[j] = cx0[j] + co0[j]; + + std::vector l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] }; + int oi = v._grid->oIndex(l0); + int ii = v._grid->iIndex(l0); + int lti = i; + + //if (cb < 2 && i<2) + // std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl; + + for (int s=0;s<4;s++) + for (int c=0;c<3;c++) { + Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii]; + int ti = 12*lti + 3*s + c; + buf[2*ti+0] = ld.real(); + buf[2*ti+1] = ld.imag(); + } + } + } + } + + int globalToLocalCanonicalBlock(int slot,const std::vector& src_nodes,int nb) { + // processor coordinate + int _nd = (int)src_nodes.size(); + std::vector _src_nodes = src_nodes; + std::vector pco(_nd); + Lexicographic::CoorFromIndex(pco,slot,_src_nodes); + std::vector cpco = { pco[1], pco[2], pco[3], pco[4], pco[0] }; + + // get local block + std::vector _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] }; + assert(_nd == 5); + std::vector c_src_local_blocks(_nd); + for (int i=0;i<_nd;i++) { + assert(_grid->_fdimensions[i] % (src_nodes[i] * _bs[i]) == 0); + c_src_local_blocks[(i+4) % 5] = _grid->_fdimensions[i] / src_nodes[i] / _bs[i]; + } + std::vector cbcoor(_nd); // coordinate of block in slot in canonical form + Lexicographic::CoorFromIndex(cbcoor,nb,c_src_local_blocks); + + // cpco, cbcoor + std::vector clbcoor(_nd); + for (int i=0;i<_nd;i++) { + int cgcoor = cpco[i] * c_src_local_blocks[i] + cbcoor[i]; // global block coordinate + int pcoor = cgcoor / _nbc[i]; // processor coordinate in my Grid + int tpcoor = _grid->_processor_coor[(i+1)%5]; + if (pcoor != tpcoor) + return -1; + clbcoor[i] = cgcoor - tpcoor * _nbc[i]; // canonical local block coordinate for canonical dimension i + } + + int lnb; + Lexicographic::IndexFromCoor(clbcoor,lnb,_nbc); + //std::cout << "Mapped slot = " << slot << " nb = " << nb << " to " << lnb << std::endl; + return lnb; + } + + + }; + +} diff --git a/tests/lanczos/Test_dwf_compressed_lanczos.cc b/tests/lanczos/Test_dwf_compressed_lanczos.cc index a6eb95e9..45690f05 100644 --- 
a/tests/lanczos/Test_dwf_compressed_lanczos.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos.cc @@ -331,7 +331,7 @@ void CoarseGridLanczos(BlockProjector& pr,RealD alpha2,RealD beta,int Npo ) { - IRL2.calc(eval2,coef._v,src_coarse,Nconv,true,SkipTest2); + IRL2.calc(eval2,coef._v,src_coarse,Nconv,true); coef.resize(Nstop2); eval2.resize(Nstop2); @@ -635,7 +635,7 @@ int main (int argc, char ** argv) { if (simple_krylov_basis) { quick_krylov_basis(evec,src,Op1,Nstop1); } else { - IRL1.calc(eval1,evec._v,src,Nconv,false,1); + IRL1.calc(eval1,evec._v,src,Nconv,false); } evec.resize(Nstop1); // and throw away superfluous eval1.resize(Nstop1); diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc index a0691116..8fbbacbc 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc @@ -56,6 +56,7 @@ struct CompressedLanczosParams : Serializable { LanczosParams, FineParams, LanczosParams, CoarseParams, ChebyParams, Smoother, + RealD , coarse_relax_tol, std::vector, blockSize, std::string, config, std::vector < std::complex >, omega, @@ -137,12 +138,13 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc OperatorFunction & _smoother; LinearOperatorBase &_Linop; Aggregation &_Aggregate; - + RealD _coarse_relax_tol; ImplicitlyRestartedLanczosSmoothedTester(LinearFunction &Poly, OperatorFunction &smoother, LinearOperatorBase &Linop, - Aggregation &Aggregate) - : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly) { }; + Aggregation &Aggregate, + RealD coarse_relax_tol=5.0e3) + : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { }; int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) { @@ -196,7 +198,7 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc <<"eval = "< nbasis ) eresid = eresid*_coarse_relax_tol; if( (vv=nbasis); @@ -345,7 +347,7 @@ public: evals_fine.resize(nbasis); _Aggregate.subspace.resize(nbasis,_FineGrid); } - void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth, + void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, int Nstop, int Nk, int Nm,RealD resid, RealD MaxIt, RealD betastp, int MinRes) { @@ -357,8 +359,7 @@ public: ////////////////////////////////////////////////////////////////////////////////////////////////// Chebyshev ChebySmooth(cheby_smooth); - ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate); - + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); evals_coarse.resize(Nm); evec_coarse.resize(Nm,_CoarseGrid); @@ -367,7 +368,7 @@ public: ImplicitlyRestartedLanczos IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); int Nconv=0; - IRL.calc(evals_coarse,evec_coarse,src,Nconv,false,1); + IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); assert(Nconv>=Nstop); for (int i=0;i Date: Thu, 26 Oct 2017 07:45:56 +0100 Subject: [PATCH 24/63] Update to IRL; getting close to the structure I would like. 
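
This splits the convergence logic out of calc(): an
ImplicitlyRestartedLanczosTester interface now supplies TestConvergence and
ReconstructEval (with a HermOp-based default tester), and basisRotateJ
extracts a single rotated vector so that candidates can be tested, and
converged eigenpairs reconstructed, one at a time rather than by rotating
the whole basis.

A minimal sketch of the intended call pattern (illustrative only; the field
type, the Chebyshev window and the HermOp object are placeholders, not part
of this diff):

    Chebyshev<FermionField>      Cheby(lo,hi,order);
    FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
    PlainHermOp<FermionField>    Op(HermOp);

    // The default tester applies Op and checks the Rayleigh-quotient residual;
    // a custom tester (e.g. the smoothed coarse-grid one) can be passed instead.
    ImplicitlyRestartedLanczosHermOpTester<FermionField> Tester(Op);

    ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Tester,Nstop,Nk,Nm,resid,MaxIt);

    std::vector<RealD>        eval(Nm);
    std::vector<FermionField> evec(Nm,FGrid);
    int Nconv;
    IRL.calc(eval,evec,src,Nconv);

Omitting the Tester argument falls back to the HermOp-based default.
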
--- .../iterative/ImplicitlyRestartedLanczos.h | 234 +++++++++++------- 1 file changed, 142 insertions(+), 92 deletions(-) diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 6d3e0755..4be2715a 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -71,6 +71,23 @@ void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, i } } +// Extract a single rotated vector +template +void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) +{ + typedef typename Field::vector_object vobj; + GridBase* grid = basis[0]._grid; + + result.checkerboard = basis[0].checkerboard; + parallel_for(int ss=0;ss < grid->oSites();ss++){ + vobj B = zero; + for(int k=k0; k void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) { @@ -87,9 +104,7 @@ void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, s assert(idx[i] > i); ////////////////////////////////////// // idx[i] is a table of desired sources giving a permutation. - // // Swap v[i] with v[idx[i]]. - // // Find j>i for which _vnew[j] = _vold[i], // track the move idx[j] => idx[i] // track the move idx[i] => i @@ -155,6 +170,49 @@ enum IRLdiagonalisation { ///////////////////////////////////////////////////////////// // Implicitly restarted lanczos ///////////////////////////////////////////////////////////// +template class ImplicitlyRestartedLanczosTester +{ + public: + virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox); + virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox); +}; + +template class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester +{ + public: + LinearFunction &_HermOpTest; + ImplicitlyRestartedLanczosHermOpTester(LinearFunction &HermOpTest) : _HermOpTest(HermOpTest) { }; + int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) + { + return TestConvergence(j,resid,B,eval,evalMaxApprox); + } + int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox) + { + Field v(B); + RealD eval_poly = eval; + // Apply operator + _HermOpTest(B,v); + + RealD vnum = real(innerProduct(B,v)); // HermOp. 
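+      // Rayleigh quotient: eval = <B|Op|B> / <B|B>.  The squared residual
+      // ||Op B - eval B||^2 is then divided by evalMaxApprox^2 and compared
+      // against eresid^2 to decide convergence of this candidate.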
+ RealD vden = norm2(B); + RealD vv0 = norm2(v); + eval = vnum/vden; + v -= eval*B; + + RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0); + + std::cout.precision(13); + std::cout< class ImplicitlyRestartedLanczos { @@ -174,14 +232,19 @@ class ImplicitlyRestartedLanczos { //////////////////////////////// // Embedded objects //////////////////////////////// - LinearFunction &_HermOp; - LinearFunction &_HermOpTest; + LinearFunction &_HermOp; + LinearFunction &_HermOpTest; + ImplicitlyRestartedLanczosTester &_Tester; + // Default tester provided (we need a ref to something in default case) + ImplicitlyRestartedLanczosHermOpTester SimpleTester; ///////////////////////// // Constructor ///////////////////////// + public: ImplicitlyRestartedLanczos(LinearFunction & HermOp, LinearFunction & HermOpTest, + ImplicitlyRestartedLanczosTester & Tester, int _Nstop, // sought vecs int _Nk, // sought vecs int _Nm, // spare vecs @@ -190,7 +253,23 @@ public: RealD _betastp=0.0, // if beta(k) < betastp: converged int _MinRestart=1, int _orth_period = 1, IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : - _HermOp(HermOp), _HermOpTest(HermOpTest), + SimpleTester(HermOpTest), _HermOp(HermOp), _HermOpTest(HermOpTest), _Tester(Tester), + Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), + eresid(_eresid), betastp(_betastp), + MaxIter(_MaxIter) , MinRestart(_MinRestart), + orth_period(_orth_period), diagonalisation(_diagonalisation) { }; + + ImplicitlyRestartedLanczos(LinearFunction & HermOp, + LinearFunction & HermOpTest, + int _Nstop, // sought vecs + int _Nk, // sought vecs + int _Nm, // spare vecs + RealD _eresid, // resid in lmdue deficit + int _MaxIter, // Max iterations + RealD _betastp=0.0, // if beta(k) < betastp: converged + int _MinRestart=1, int _orth_period = 1, + IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : + SimpleTester(HermOpTest), _HermOp(HermOp), _HermOpTest(HermOpTest), _Tester(SimpleTester), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), eresid(_eresid), betastp(_betastp), MaxIter(_MaxIter) , MinRestart(_MinRestart), @@ -232,7 +311,7 @@ repeat →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM until convergence */ - void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse=true, int SkipTest=0) + void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse=true) { GridBase *grid = src._grid; assert(grid == evec[0]._grid); @@ -335,11 +414,18 @@ until convergence ////////////////////////////////// eval2_copy = eval2; - // _sort.push(eval2,Nm); - std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end()); + std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater()); std::cout<0); - basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis + assert(k20); + basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis std::cout<= MinRestart) { - std::cout << GridLogIRL << "Rotation to test convergence " << std::endl; - - Field ev0_orig(grid); - ev0_orig = evec[0]; - - basisRotate(evec,Qt,0,Nk,0,Nk,Nm); - { - std::cout << GridLogIRL << "Test convergence" << std::endl; - Field B(grid); - - for(int j = 0; j Nconv ) { + Nconv=j+1; + jj=Nstop; // Terminate the scan } } - - // test if we converged, if so, terminate - std::cout<=Nstop || beta_k < betastp){ - goto converged; - } - - //B[j] +=Qt[k+_Nm*j] * _v[k]._odata[ss]; - { - Eigen::MatrixXd qm = Eigen::MatrixXd::Zero(Nk,Nk); // Restrict Qt to Nk x Nk - for (int k=0;k= "<=Nstop || beta_k < betastp){ + if( 
Nconv>=Nstop){ + goto converged; + } + } else { std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n"; } // end of iter loop @@ -461,24 +510,28 @@ until convergence converged: - if (SkipTest == 1) { - eval = eval2; - } else { - ////////////////////////////////////////////// - // test quickly - // PAB -- what precisely does this test? Don't like this eval2, eval2_copy etc... - ////////////////////////////////////////////// - for (int j=0;j0) w -= lme[k-1] * evec[k-1]; @@ -529,8 +581,6 @@ until convergence lmd[k] = alph; lme[k] = beta; - std::cout<0 && k % orth_period == 0) { orthogonalize(w,evec,k); // orthonormalise std::cout< Date: Thu, 26 Oct 2017 07:47:42 +0100 Subject: [PATCH 25/63] Moving these out of algorithms --- .../BlockProjector.h | 143 ------- .../BlockedGrid.h | 401 ------------------ .../FieldBasisVector.h | 162 ------- 3 files changed, 706 deletions(-) delete mode 100644 lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockProjector.h delete mode 100644 lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockedGrid.h delete mode 100644 lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/FieldBasisVector.h diff --git a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockProjector.h b/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockProjector.h deleted file mode 100644 index 6becaa66..00000000 --- a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockProjector.h +++ /dev/null @@ -1,143 +0,0 @@ -namespace Grid { - -/* - BlockProjector - - If _HP_BLOCK_PROJECTORS_ is defined, we assume that _evec is a basis that is not - fully orthonormalized (to the precision of the coarse field) and we allow for higher-precision - coarse field than basis field. - -*/ -//#define _HP_BLOCK_PROJECTORS_ - -template -class BlockProjector { -public: - - BasisFieldVector& _evec; - BlockedGrid& _bgrid; - - BlockProjector(BasisFieldVector& evec, BlockedGrid& bgrid) : _evec(evec), _bgrid(bgrid) { - } - - void createOrthonormalBasis(RealD thres = 0.0) { - - GridStopWatch sw; - sw.Start(); - - int cnt = 0; - -#pragma omp parallel shared(cnt) - { - int lcnt = 0; - -#pragma omp for - for (int b=0;b<_bgrid._o_blocks;b++) { - - for (int i=0;i<_evec._Nm;i++) { - - auto nrm0 = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]); - - // |i> -= |j> - for (int j=0;j - void coarseToFine(const CoarseField& in, Field& out) { - - out = zero; - out.checkerboard = _evec._v[0].checkerboard; - - int Nbasis = sizeof(in._odata[0]._internal._internal) / sizeof(in._odata[0]._internal._internal[0]); - assert(Nbasis == _evec._Nm); - -#pragma omp parallel for - for (int b=0;b<_bgrid._o_blocks;b++) { - for (int j=0;j<_evec._Nm;j++) { - _bgrid.block_caxpy(b,out,in._odata[b]._internal._internal[j],_evec._v[j],out); - } - } - - } - - template - void fineToCoarse(const Field& in, CoarseField& out) { - - out = zero; - - int Nbasis = sizeof(out._odata[0]._internal._internal) / sizeof(out._odata[0]._internal._internal[0]); - assert(Nbasis == _evec._Nm); - - - Field tmp(_bgrid._grid); - tmp = in; - -#pragma omp parallel for - for (int b=0;b<_bgrid._o_blocks;b++) { - for (int j=0;j<_evec._Nm;j++) { - // |rhs> -= |j> - auto c = _bgrid.block_sp(b,_evec._v[j],tmp); - _bgrid.block_caxpy(b,tmp,-c,_evec._v[j],tmp); // may make this more numerically stable - out._odata[b]._internal._internal[j] = c; - } - } - - } - - template - void deflateFine(BasisFieldVector& _coef,const std::vector& eval,int N,const Field& src_orig,Field& result) { - result = zero; - for (int i=0;i - 
void deflateCoarse(BasisFieldVector& _coef,const std::vector& eval,int N,const Field& src_orig,Field& result) { - CoarseField src_coarse(_coef._v[0]._grid); - CoarseField result_coarse = src_coarse; - result_coarse = zero; - fineToCoarse(src_orig,src_coarse); - for (int i=0;i - void deflate(BasisFieldVector& _coef,const std::vector& eval,int N,const Field& src_orig,Field& result) { - // Deflation on coarse Grid is much faster, so use it by default. Deflation on fine Grid is kept for legacy reasons for now. - deflateCoarse(_coef,eval,N,src_orig,result); - } - -}; -} diff --git a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockedGrid.h b/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockedGrid.h deleted file mode 100644 index 821272de..00000000 --- a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockedGrid.h +++ /dev/null @@ -1,401 +0,0 @@ -namespace Grid { - -template -class BlockedGrid { -public: - GridBase* _grid; - typedef typename Field::scalar_type Coeff_t; - typedef typename Field::vector_type vCoeff_t; - - std::vector _bs; // block size - std::vector _nb; // number of blocks - std::vector _l; // local dimensions irrespective of cb - std::vector _l_cb; // local dimensions of checkerboarded vector - std::vector _l_cb_o; // local dimensions of inner checkerboarded vector - std::vector _bs_cb; // block size in checkerboarded vector - std::vector _nb_o; // number of blocks of simd o-sites - - int _nd, _blocks, _cf_size, _cf_block_size, _cf_o_block_size, _o_blocks, _block_sites; - - BlockedGrid(GridBase* grid, const std::vector& block_size) : - _grid(grid), _bs(block_size), _nd((int)_bs.size()), - _nb(block_size), _l(block_size), _l_cb(block_size), _nb_o(block_size), - _l_cb_o(block_size), _bs_cb(block_size) { - - _blocks = 1; - _o_blocks = 1; - _l = grid->FullDimensions(); - _l_cb = grid->LocalDimensions(); - _l_cb_o = grid->_rdimensions; - - _cf_size = 1; - _block_sites = 1; - for (int i=0;i<_nd;i++) { - _l[i] /= grid->_processors[i]; - - assert(!(_l[i] % _bs[i])); // lattice must accommodate choice of blocksize - - int r = _l[i] / _l_cb[i]; - assert(!(_bs[i] % r)); // checkerboarding must accommodate choice of blocksize - _bs_cb[i] = _bs[i] / r; - _block_sites *= _bs_cb[i]; - _nb[i] = _l[i] / _bs[i]; - _nb_o[i] = _nb[i] / _grid->_simd_layout[i]; - if (_nb[i] % _grid->_simd_layout[i]) { // simd must accommodate choice of blocksize - std::cout << GridLogMessage << "Problem: _nb[" << i << "] = " << _nb[i] << " _grid->_simd_layout[" << i << "] = " << _grid->_simd_layout[i] << std::endl; - assert(0); - } - _blocks *= _nb[i]; - _o_blocks *= _nb_o[i]; - _cf_size *= _l[i]; - } - - _cf_size *= 12 / 2; - _cf_block_size = _cf_size / _blocks; - _cf_o_block_size = _cf_size / _o_blocks; - - std::cout << GridLogMessage << "BlockedGrid:" << std::endl; - std::cout << GridLogMessage << " _l = " << _l << std::endl; - std::cout << GridLogMessage << " _l_cb = " << _l_cb << std::endl; - std::cout << GridLogMessage << " _l_cb_o = " << _l_cb_o << std::endl; - std::cout << GridLogMessage << " _bs = " << _bs << std::endl; - std::cout << GridLogMessage << " _bs_cb = " << _bs_cb << std::endl; - - std::cout << GridLogMessage << " _nb = " << _nb << std::endl; - std::cout << GridLogMessage << " _nb_o = " << _nb_o << std::endl; - std::cout << GridLogMessage << " _blocks = " << _blocks << std::endl; - std::cout << GridLogMessage << " _o_blocks = " << _o_blocks << std::endl; - std::cout << GridLogMessage << " sizeof(vCoeff_t) = " << sizeof(vCoeff_t) << std::endl; - std::cout << 
GridLogMessage << " _cf_size = " << _cf_size << std::endl; - std::cout << GridLogMessage << " _cf_block_size = " << _cf_block_size << std::endl; - std::cout << GridLogMessage << " _block_sites = " << _block_sites << std::endl; - std::cout << GridLogMessage << " _grid->oSites() = " << _grid->oSites() << std::endl; - - // _grid->Barrier(); - //abort(); - } - - void block_to_coor(int b, std::vector& x0) { - - std::vector bcoor; - bcoor.resize(_nd); - x0.resize(_nd); - assert(b < _o_blocks); - Lexicographic::CoorFromIndex(bcoor,b,_nb_o); - int i; - - for (i=0;i<_nd;i++) { - x0[i] = bcoor[i]*_bs_cb[i]; - } - - //std::cout << GridLogMessage << "Map block b -> " << x0 << std::endl; - - } - - void block_site_to_o_coor(const std::vector& x0, std::vector& coor, int i) { - Lexicographic::CoorFromIndex(coor,i,_bs_cb); - for (int j=0;j<_nd;j++) - coor[j] += x0[j]; - } - - int block_site_to_o_site(const std::vector& x0, int i) { - std::vector coor; coor.resize(_nd); - block_site_to_o_coor(x0,coor,i); - Lexicographic::IndexFromCoor(coor,i,_l_cb_o); - return i; - } - - vCoeff_t block_sp(int b, const Field& x, const Field& y) { - - std::vector x0; - block_to_coor(b,x0); - - vCoeff_t ret = 0.0; - for (int i=0;i<_block_sites;i++) { // only odd sites - int ss = block_site_to_o_site(x0,i); - ret += TensorRemove(innerProduct(x._odata[ss],y._odata[ss])); - } - - return ret; - - } - - vCoeff_t block_sp(int b, const Field& x, const std::vector< ComplexD >& y) { - - std::vector x0; - block_to_coor(b,x0); - - constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t); - int lsize = _cf_o_block_size / _block_sites; - - std::vector< ComplexD > ret(nsimd); - for (int i=0;i - void vcaxpy(iScalar& r,const vCoeff_t& a,const iScalar& x,const iScalar& y) { - vcaxpy(r._internal,a,x._internal,y._internal); - } - - template - void vcaxpy(iVector& r,const vCoeff_t& a,const iVector& x,const iVector& y) { - for (int i=0;i x0; - block_to_coor(b,x0); - - for (int i=0;i<_block_sites;i++) { // only odd sites - int ss = block_site_to_o_site(x0,i); - vcaxpy(ret._odata[ss],a,x._odata[ss],y._odata[ss]); - } - - } - - void block_caxpy(int b, std::vector< ComplexD >& ret, const vCoeff_t& a, const Field& x, const std::vector< ComplexD >& y) { - std::vector x0; - block_to_coor(b,x0); - - constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t); - int lsize = _cf_o_block_size / _block_sites; - - for (int i=0;i<_block_sites;i++) { // only odd sites - int ss = block_site_to_o_site(x0,i); - - int n = lsize / nsimd; - for (int l=0;l& x) { - std::vector x0; - block_to_coor(b,x0); - - int lsize = _cf_o_block_size / _block_sites; - - for (int i=0;i<_block_sites;i++) { // only odd sites - int ss = block_site_to_o_site(x0,i); - - for (int l=0;l& x) { - std::vector x0; - block_to_coor(b,x0); - - int lsize = _cf_o_block_size / _block_sites; - - for (int i=0;i<_block_sites;i++) { // only odd sites - int ss = block_site_to_o_site(x0,i); - - for (int l=0;l - void vcscale(iScalar& r,const vCoeff_t& a,const iScalar& x) { - vcscale(r._internal,a,x._internal); - } - - template - void vcscale(iVector& r,const vCoeff_t& a,const iVector& x) { - for (int i=0;i x0; - block_to_coor(b,x0); - - for (int i=0;i<_block_sites;i++) { // only odd sites - int ss = block_site_to_o_site(x0,i); - vcscale(ret._odata[ss],a,ret._odata[ss]); - } - } - - void getCanonicalBlockOffset(int cb, std::vector& x0) { - const int ndim = 5; - assert(_nb.size() == ndim); - std::vector _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] }; - std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] 
}; - x0.resize(ndim); - - assert(cb >= 0); - assert(cb < _nbc[0]*_nbc[1]*_nbc[2]*_nbc[3]*_nbc[4]); - - Lexicographic::CoorFromIndex(x0,cb,_nbc); - int i; - - for (i=0;i& buf) { - std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] }; - std::vector ldim = v._grid->LocalDimensions(); - std::vector cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] }; - const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4]; - // take canonical block cb of v and put it in canonical ordering in buf - std::vector cx0; - getCanonicalBlockOffset(cb,cx0); - -#pragma omp parallel - { - std::vector co0,cl0; - co0=cx0; cl0=cx0; - -#pragma omp for - for (int i=0;i<_nbsc;i++) { - Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo - for (int j=0;j<(int)_bsc.size();j++) - cl0[j] = cx0[j] + co0[j]; - - std::vector l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] }; - int oi = v._grid->oIndex(l0); - int ii = v._grid->iIndex(l0); - int lti = i; - - //if (cb < 2 && i<2) - // std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl; - - for (int s=0;s<4;s++) - for (int c=0;c<3;c++) { - Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii]; - int ti = 12*lti + 3*s + c; - ld = Coeff_t(buf[2*ti+0], buf[2*ti+1]); - } - } - } - } - - void peekBlockOfVectorCanonical(int cb,const Field& v,std::vector& buf) { - std::vector _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] }; - std::vector ldim = v._grid->LocalDimensions(); - std::vector cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] }; - const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4]; - // take canonical block cb of v and put it in canonical ordering in buf - std::vector cx0; - getCanonicalBlockOffset(cb,cx0); - - buf.resize(_cf_block_size * 2); - -#pragma omp parallel - { - std::vector co0,cl0; - co0=cx0; cl0=cx0; - -#pragma omp for - for (int i=0;i<_nbsc;i++) { - Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo - for (int j=0;j<(int)_bsc.size();j++) - cl0[j] = cx0[j] + co0[j]; - - std::vector l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] }; - int oi = v._grid->oIndex(l0); - int ii = v._grid->iIndex(l0); - int lti = i; - - //if (cb < 2 && i<2) - // std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl; - - for (int s=0;s<4;s++) - for (int c=0;c<3;c++) { - Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii]; - int ti = 12*lti + 3*s + c; - buf[2*ti+0] = ld.real(); - buf[2*ti+1] = ld.imag(); - } - } - } - } - - int globalToLocalCanonicalBlock(int slot,const std::vector& src_nodes,int nb) { - // processor coordinate - int _nd = (int)src_nodes.size(); - std::vector _src_nodes = src_nodes; - std::vector pco(_nd); - Lexicographic::CoorFromIndex(pco,slot,_src_nodes); - std::vector cpco = { pco[1], pco[2], pco[3], pco[4], pco[0] }; - - // get local block - std::vector _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] }; - assert(_nd == 5); - std::vector c_src_local_blocks(_nd); - for (int i=0;i<_nd;i++) { - assert(_grid->_fdimensions[i] % (src_nodes[i] * _bs[i]) == 0); - c_src_local_blocks[(i+4) % 5] = _grid->_fdimensions[i] / src_nodes[i] / _bs[i]; - } - std::vector cbcoor(_nd); // coordinate of block in slot in canonical form - Lexicographic::CoorFromIndex(cbcoor,nb,c_src_local_blocks); - - // cpco, cbcoor - std::vector clbcoor(_nd); - for (int i=0;i<_nd;i++) { - int cgcoor = cpco[i] * c_src_local_blocks[i] + cbcoor[i]; // 
global block coordinate - int pcoor = cgcoor / _nbc[i]; // processor coordinate in my Grid - int tpcoor = _grid->_processor_coor[(i+1)%5]; - if (pcoor != tpcoor) - return -1; - clbcoor[i] = cgcoor - tpcoor * _nbc[i]; // canonical local block coordinate for canonical dimension i - } - - int lnb; - Lexicographic::IndexFromCoor(clbcoor,lnb,_nbc); - //std::cout << "Mapped slot = " << slot << " nb = " << nb << " to " << lnb << std::endl; - return lnb; - } - - - }; - -} diff --git a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/FieldBasisVector.h b/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/FieldBasisVector.h deleted file mode 100644 index 3ad516ef..00000000 --- a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/FieldBasisVector.h +++ /dev/null @@ -1,162 +0,0 @@ -namespace Grid { - -template -class BasisFieldVector { - public: - int _Nm; - - typedef typename Field::scalar_type Coeff_t; - typedef typename Field::vector_type vCoeff_t; - typedef typename Field::vector_object vobj; - typedef typename vobj::scalar_object sobj; - - std::vector _v; // _Nfull vectors - - void report(int n,GridBase* value) { - - std::cout << GridLogMessage << "BasisFieldVector allocated:\n"; - std::cout << GridLogMessage << " Delta N = " << n << "\n"; - std::cout << GridLogMessage << " Size of full vectors (size) = " << - ((double)n*sizeof(vobj)*value->oSites() / 1024./1024./1024.) << " GB\n"; - std::cout << GridLogMessage << " Size = " << _v.size() << " Capacity = " << _v.capacity() << std::endl; - - value->Barrier(); - - if (value->IsBoss()) { - system("cat /proc/meminfo"); - } - - value->Barrier(); - - } - - BasisFieldVector(int Nm,GridBase* value) : _Nm(Nm), _v(Nm,value) { - report(Nm,value); - } - - ~BasisFieldVector() { - } - - Field& operator[](int i) { - return _v[i]; - } - - void orthogonalize(Field& w, int k) { - for(int j=0; j B(Nm); - -#pragma omp for - for(int ss=0;ss < grid->oSites();ss++){ - for(int j=j0; j _Nm) - _v.reserve(n); - - _v.resize(n,_v[0]._grid); - - if (n < _Nm) - _v.shrink_to_fit(); - - report(n - _Nm,_v[0]._grid); - - _Nm = n; - } - - std::vector getIndex(std::vector& sort_vals) { - - std::vector idx(sort_vals.size()); - iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - sort(idx.begin(), idx.end(), - [&sort_vals](int i1, int i2) {return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);}); - - return idx; - } - - void reorderInPlace(std::vector& sort_vals, std::vector& idx) { - GridStopWatch gsw; - gsw.Start(); - - int nswaps = 0; - for (size_t i=0;i& sort_vals, bool reverse) { - - std::vector idx = getIndex(sort_vals); - if (reverse) - std::reverse(idx.begin(), idx.end()); - - reorderInPlace(sort_vals,idx); - - } - - void deflate(const std::vector& eval,const Field& src_orig,Field& result) { - result = zero; - int N = (int)_v.size(); - for (int i=0;i Date: Thu, 26 Oct 2017 07:48:03 +0100 Subject: [PATCH 26/63] Test for split/unsplit in isolation --- tests/solver/Test_split_grid.cc | 144 ++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 tests/solver/Test_split_grid.cc diff --git a/tests/solver/Test_split_grid.cc b/tests/solver/Test_split_grid.cc new file mode 100644 index 00000000..90969b85 --- /dev/null +++ b/tests/solver/Test_split_grid.cc @@ -0,0 +1,144 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_mrhs_cg.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + 
This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + typedef typename DomainWallFermionR::FermionField FermionField; + typedef typename DomainWallFermionR::ComplexField ComplexField; + typename DomainWallFermionR::ImplParams params; + + const int Ls=4; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + std::vector mpi_split (mpi_layout.size(),1); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + int nrhs = UGrid->RankCount() ; + + ///////////////////////////////////////////// + // Split into 1^4 mpi communicators + ///////////////////////////////////////////// + GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + mpi_split, + *UGrid); + + GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid); + GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid); + GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid); + + /////////////////////////////////////////////// + // Set up the problem as a 4d spreadout job + /////////////////////////////////////////////// + std::vector seeds({1,2,3,4}); + + GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds); + GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds); + std::vector src(nrhs,FGrid); + std::vector src_chk(nrhs,FGrid); + std::vector result(nrhs,FGrid); + FermionField tmp(FGrid); + + for(int s=0;sThisRank(); + + LatticeGaugeField s_Umu(SGrid); + FermionField s_src(SFGrid); + FermionField s_tmp(SFGrid); + FermionField s_res(SFGrid); + + /////////////////////////////////////////////////////////////// + // split the source out using MPI instead of I/O + /////////////////////////////////////////////////////////////// + Grid_split (Umu,s_Umu); + Grid_split (src,s_src); + + /////////////////////////////////////////////////////////////// + // Set up N-solvers as trivially parallel + /////////////////////////////////////////////////////////////// + RealD mass=0.01; + RealD M5=1.8; + DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5); + DomainWallFermionR 
Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5); + + std::cout << GridLogMessage << "****************************************************************** "< HermOp(Ddwf); + MdagMLinearOperator HermOpCk(Dchk); + ConjugateGradient CG((1.0e-8/(me+1)),10000); + s_res = zero; + CG(HermOp,s_src,s_res); + + ///////////////////////////////////////////////////////////// + // Report how long they all took + ///////////////////////////////////////////////////////////// + std::vector iterations(nrhs,0); + iterations[me] = CG.IterationsToComplete; + + for(int n=0;nGlobalSum(iterations[n]); + std::cout << GridLogMessage<<" Rank "< Date: Thu, 26 Oct 2017 16:25:01 +0100 Subject: [PATCH 27/63] Final? candidate for push back on the lanczos reorg feature --- .../Test_dwf_compressed_lanczos_reorg.cc | 33 ++----------------- 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc index 8fbbacbc..ad1aaa47 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc @@ -374,20 +374,6 @@ public: for (int i=0;i Date: Thu, 26 Oct 2017 20:58:46 +0100 Subject: [PATCH 28/63] Staggered updates : Schur fixed and added a unit test for Test_staggered_cg_schur.cc giving stronger check --- lib/algorithms/LinearOperator.h | 2 +- lib/algorithms/iterative/SchurRedBlack.h | 15 ++++- lib/communicator/Communicator_mpi3.cc | 7 ++- lib/communicator/Communicator_mpit.cc | 18 +++--- tests/solver/Test_staggered_cg_prec.cc | 1 - tests/solver/Test_staggered_cg_schur.cc | 76 ++++++++++++++++++++++++ 6 files changed, 103 insertions(+), 16 deletions(-) create mode 100644 tests/solver/Test_staggered_cg_schur.cc diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index f1b8820e..2a757352 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -319,7 +319,7 @@ namespace Grid { Field tmp(in._grid); _Mat.Meooe(in,tmp); _Mat.MooeeInv(tmp,out); - _Mat.MeooeDag(out,tmp); + _Mat.Meooe(out,tmp); _Mat.Mooee(in,out); return axpy_norm(out,-1.0,tmp,out); } diff --git a/lib/algorithms/iterative/SchurRedBlack.h b/lib/algorithms/iterative/SchurRedBlack.h index a309386b..a0fd86a6 100644 --- a/lib/algorithms/iterative/SchurRedBlack.h +++ b/lib/algorithms/iterative/SchurRedBlack.h @@ -55,7 +55,15 @@ Author: Peter Boyle *Odd * i) D_oo psi_o = L^{-1} eta_o * eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e) + * + * Wilson: * (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1} eta_o + * Stag: + * D_oo psi_o = L^{-1} eta = (eta_o - Moe Mee^{-1} eta_e) + * + * L^-1 eta_o= (1 0 ) (e + * (-MoeMee^{-1} 1 ) + * *Even * ii) Mee psi_e + Meo psi_o = src_e * @@ -122,18 +130,19 @@ namespace Grid { pickCheckerboard(Odd ,sol_o,out); ///////////////////////////////////////////////////// - // src_o = Mdag * (source_o - Moe MeeInv source_e) + // src_o = (source_o - Moe MeeInv source_e) ///////////////////////////////////////////////////// _Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); - _Matrix.Mooee(tmp,src_o); assert(src_o.checkerboard ==Odd); + src_o = tmp; assert(src_o.checkerboard ==Odd); + // _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source ////////////////////////////////////////////////////////////// // Call the red-black solver ////////////////////////////////////////////////////////////// - std::cout< + + This 
program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typename ImprovedStaggeredFermionR::ImplParams params; + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + FermionField src(&Grid); random(pRNG,src); + FermionField result(&Grid); result=zero; + FermionField resid(&Grid); + + RealD mass=0.1; + ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass); + + ConjugateGradient CG(1.0e-8,10000); + SchurRedBlackStaggeredSolve SchurSolver(CG); + + SchurSolver(Ds,src,result); + + Grid_finalize(); +} From 0f3e9ae57d4a0cc6f7f8ec1d0fa8e922335aab72 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 26 Oct 2017 23:29:59 +0100 Subject: [PATCH 29/63] Gsites error. Only appeared (so far) in I/O code for even odd fields --- lib/cartesian/Cartesian_red_black.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index f89cacc5..5c50f062 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -205,6 +205,7 @@ public: { assert((_gdimensions[d] & 0x1) == 0); _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard + _gsites /= 2; } _ldimensions[d] = _gdimensions[d] / _processors[d]; assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); From 00ebc150ad6a6db27000829c6830ea8b855bacfe Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 26 Oct 2017 23:30:37 +0100 Subject: [PATCH 30/63] Mistake in string parse; interface is ambiguous and must fix. Is char * a file, or a XML buffer ? 
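The one-line fix below swaps pugi::xml_document::load_file for load_string in the
XmlReader constructor that takes a char* XML buffer: both functions accept a
const char*, but load_file treats it as a path on disk while load_string parses the
buffer itself, which is the ambiguity the subject line complains about. A small
sketch, assuming the bundled pugixml API:

    #include <pugixml.hpp>
    #include <cassert>

    void demo() {
      const char *xmlstring = "<grid><Nstop>60</Nstop></grid>";
      pugi::xml_document doc;
      pugi::xml_parse_result bad  = doc.load_file(xmlstring);   // wrong: looks for a *file* with this name
      pugi::xml_parse_result good = doc.load_string(xmlstring); // right: parses the in-memory buffer
      assert(good); (void)bad;
    }

A longer-term option would be distinct, explicitly named entry points (or a
separate path type) for "file name" versus "XML buffer", so the compiler cannot
silently pick the wrong meaning for a bare char*.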
--- lib/serialisation/XmlIO.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/serialisation/XmlIO.cc b/lib/serialisation/XmlIO.cc index c0c45adc..260611a5 100644 --- a/lib/serialisation/XmlIO.cc +++ b/lib/serialisation/XmlIO.cc @@ -68,10 +68,10 @@ std::string XmlWriter::XmlString(void) XmlReader::XmlReader(const char *xmlstring,string toplev) : fileName_("") { pugi::xml_parse_result result; - result = doc_.load_file(xmlstring); + result = doc_.load_string(xmlstring); if ( !result ) { - cerr << "XML error description: char * " << result.description() << " "<< xmlstring << "\n"; - cerr << "XML error offset : char * " << result.offset << " "< Date: Thu, 26 Oct 2017 23:31:46 +0100 Subject: [PATCH 31/63] Cleaning up --- .../iterative/ImplicitlyRestartedLanczos.h | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 4be2715a..089e7ff3 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -37,6 +37,9 @@ Author: Christoph Lehner namespace Grid { + //////////////////////////////////////////////////////// + // Move following 100 LOC to lattice/Lattice_basis.h + //////////////////////////////////////////////////////// template void basisOrthogonalize(std::vector &basis,Field &w,int k) { @@ -101,7 +104,6 @@ void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, s if (idx[i] != i) { - assert(idx[i] > i); ////////////////////////////////////// // idx[i] is a table of desired sources giving a permutation. // Swap v[i] with v[idx[i]]. @@ -114,8 +116,7 @@ void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, s if (idx[j]==i) break; - assert(j!=idx.size()); - assert(idx[j]==i); + assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i); std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy std::swap(sort_vals[i],sort_vals[idx[i]]); @@ -161,12 +162,6 @@ void basisDeflate(const std::vector &_v,const std::vector& eval,co } } -enum IRLdiagonalisation { - IRLdiagonaliseWithDSTEGR, - IRLdiagonaliseWithQR, - IRLdiagonaliseWithEigen -}; - ///////////////////////////////////////////////////////////// // Implicitly restarted lanczos ///////////////////////////////////////////////////////////// @@ -177,6 +172,12 @@ template class ImplicitlyRestartedLanczosTester virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox); }; +enum IRLdiagonalisation { + IRLdiagonaliseWithDSTEGR, + IRLdiagonaliseWithQR, + IRLdiagonaliseWithEigen +}; + template class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester { public: @@ -242,6 +243,17 @@ class ImplicitlyRestartedLanczos { ///////////////////////// public: + ////////////////////////////////////////////////////////////////// + // PAB: + ////////////////////////////////////////////////////////////////// + // Too many options & knobs. Do we really need orth_period + // What is the theoretical basis & guarantees of betastp ? + // Nstop=Nk viable? + // MinRestart avoidable with new convergence test? + // Could cut to HermOp, HermOpTest, Tester, Nk, Nm, resid, maxiter (+diagonalisation) + // HermOpTest could be eliminated if we dropped the Power method for max eval. 
+ // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear + ////////////////////////////////////////////////////////////////// ImplicitlyRestartedLanczos(LinearFunction & HermOp, LinearFunction & HermOpTest, ImplicitlyRestartedLanczosTester & Tester, @@ -413,16 +425,14 @@ until convergence // sorting ////////////////////////////////// eval2_copy = eval2; - std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater()); - std::cout<0); basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis - std::cout<& lmd, // Nm std::vector& lme, // Nm int Nk, int Nm, // Nk, Nm From 9ec9850bdb49548238b1cb253c82bfeee3823683 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 26 Oct 2017 23:34:31 +0100 Subject: [PATCH 32/63] 64bit ftello update --- lib/parallelIO/IldgIO.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index 1f2b7c90..36ecbd1b 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -224,7 +224,7 @@ class GridLimeReader : public BinaryIO { assert(PayloadSize == file_bytes);// Must match or user error - off_t offset= ftell(File); + uint64_t offset= ftello(File); // std::cout << " ReadLatticeObject from offset "< munge; BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); @@ -253,16 +253,13 @@ class GridLimeReader : public BinaryIO { while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { // std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" < xmlc(nbytes+1,'\0'); limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR); - // std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <=0); err=limeWriterCloseRecord(LimeW); assert(err>=0); limeDestroyHeader(h); - // std::cout << " File offset is now"<(); BinarySimpleMunger munge; BinaryIO::writeLatticeObject(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); + // fseek(File,0,SEEK_END); offset = ftello(File);std::cout << " offset now "<=0); + //////////////////////////////////////// // Write checksum element, propagaing forward from the BinaryIO // Always pair a checksum with a binary object, and close message @@ -703,8 +702,7 @@ class IldgReader : public GridLimeReader { // Binary data ///////////////////////////////// std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl; - off_t offset= ftell(File); - + uint64_t offset= ftello(File); if ( format == std::string("IEEE64BIG") ) { GaugeSimpleMunger munge; BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); From 7fab183c0eebfd82e006eca2130d809131a36074 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 27 Oct 2017 08:17:49 +0100 Subject: [PATCH 33/63] Better read test --- lib/parallelIO/IldgIO.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index 36ecbd1b..b86e250f 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -159,7 +159,7 @@ namespace QCD { uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16); if ( scidac_csuma !=scidac_checksuma) return 0; if ( scidac_csumb !=scidac_checksumb) return 0; - return 1; + return 1; } //////////////////////////////////////////////////////////////////////////////////// @@ -237,7 +237,7 @@ class GridLimeReader : public BinaryIO { ///////////////////////////////////////////// 
// Verify checksums ///////////////////////////////////////////// - scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb); + assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1); return; } } From fa04b6d3c233d6057fb5133c8e5627bc2d941aba Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 27 Oct 2017 08:18:29 +0100 Subject: [PATCH 34/63] Finished ? Verifying coarse evec restore --- .../Test_dwf_compressed_lanczos_reorg.cc | 145 +++++++++++++----- 1 file changed, 109 insertions(+), 36 deletions(-) diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc index ad1aaa47..42814e2f 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc @@ -50,9 +50,13 @@ struct LanczosParams : Serializable { int, MinRes); // Must restart }; -struct CompressedLanczosParams : Serializable { +struct LocalCoherenceLanczosParams : Serializable { public: - GRID_SERIALIZABLE_CLASS_MEMBERS(CompressedLanczosParams, + GRID_SERIALIZABLE_CLASS_MEMBERS(bool, doFine, + bool, doFineRead, + bool, doCoarse, + bool, doCoarseRead, + LocalCoherenceLanczosParams, LanczosParams, FineParams, LanczosParams, CoarseParams, ChebyParams, Smoother, @@ -61,8 +65,7 @@ struct CompressedLanczosParams : Serializable { std::string, config, std::vector < std::complex >, omega, RealD, mass, - RealD, M5 - ); + RealD, M5); }; // Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function @@ -209,7 +212,7 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc // Make serializable Lanczos params //////////////////////////////////////////// template -class CoarseFineIRL +class LocalCoherenceLanczos { public: typedef iVector CoarseSiteVector; @@ -230,7 +233,7 @@ private: std::vector evals_coarse; std::vector evec_coarse; public: - CoarseFineIRL(GridBase *FineGrid, + LocalCoherenceLanczos(GridBase *FineGrid, GridBase *CoarseGrid, LinearOperatorBase &FineOp, int checkerboard) : @@ -253,7 +256,7 @@ public: return nn; } - void testFine(void) + void fakeFine(void) { int Nk = nbasis; _Aggregate.subspace.resize(Nk,_FineGrid); @@ -286,6 +289,42 @@ public: write(WR,"evals",evals_fine); } } + + void checkpointFineRestore(std::string evecs_file,std::string evals_file) + { + evals_fine.resize(nbasis); + _Aggregate.subspace.resize(nbasis,_FineGrid); + { + std::cout << GridLogIRL<< "checkpointFineRestore: Reading evals from "< Op(_FineOp); + ImplicitlyRestartedLanczosHermOpTester SimpleTester(Op); + for(int k=0;k ChebySmooth(cheby_smooth); + ProjectedFunctionHermOp ChebyOp (ChebySmooth,_FineOp,_Aggregate); + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); + + for(int k=0;k=Nstop); - + evals_coarse.resize(Nstop); + evec_coarse.resize (Nstop,_CoarseGrid); for (int i=0;i IRL(FrbGrid,CoarseGrid5rb,HermOp,Odd); - std::cout << GridLogMessage << "Constructed CoarseFine IRL" << std::endl; + LocalCoherenceLanczos _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd); + std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl; - int do_fine = 1; - int do_coarse = 0; - int do_smooth = 0; - if ( do_fine ) { + if ( Params.doCoarse ) { + assert( (Params.doFine)||(Params.doFineRead)); + } + + if ( Params.doFine ) { std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "< Date: Fri, 27 Oct 2017 09:04:31 +0100 Subject: [PATCH 35/63] Move the local coherence lanczos 
into algorithms. Keep the I/O in the tester. Other people can copy this method to write other I/O formats. --- .../iterative/LocalCoherenceLanczos.h | 348 ++++++++++++++ .../Test_dwf_compressed_lanczos_reorg.cc | 436 +++--------------- 2 files changed, 410 insertions(+), 374 deletions(-) create mode 100644 lib/algorithms/iterative/LocalCoherenceLanczos.h diff --git a/lib/algorithms/iterative/LocalCoherenceLanczos.h b/lib/algorithms/iterative/LocalCoherenceLanczos.h new file mode 100644 index 00000000..6b8fe62c --- /dev/null +++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h @@ -0,0 +1,348 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h + + Copyright (C) 2015 + +Author: Christoph Lehner +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_LOCAL_COHERENCE_IRL_H +#define GRID_LOCAL_COHERENCE_IRL_H +namespace Grid { +struct LanczosParams : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, + ChebyParams, Cheby,/*Chebyshev*/ + int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/ + int, Nk, /*Vecs in Lanczos seek converge*/ + int, Nm, /*Total vecs in Lanczos include restart*/ + RealD, resid, /*residual*/ + int, MaxIt, + RealD, betastp, /* ? 
*/ + int, MinRes); // Must restart +}; + +struct LocalCoherenceLanczosParams : Serializable { + public: + GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, + bool, doFine, + bool, doFineRead, + bool, doCoarse, + bool, doCoarseRead, + LanczosParams, FineParams, + LanczosParams, CoarseParams, + ChebyParams, Smoother, + RealD , coarse_relax_tol, + std::vector, blockSize, + std::string, config, + std::vector < std::complex >, omega, + RealD, mass, + RealD, M5); +}; + +// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function +template +class ProjectedHermOp : public LinearFunction > > { +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + LinearOperatorBase &_Linop; + Aggregation &_Aggregate; + + ProjectedHermOp(LinearOperatorBase& linop, Aggregation &aggregate) : + _Linop(linop), + _Aggregate(aggregate) { }; + + void operator()(const CoarseField& in, CoarseField& out) { + + GridBase *FineGrid = _Aggregate.FineGrid; + FineField fin(FineGrid); + FineField fout(FineGrid); + + _Aggregate.PromoteFromSubspace(in,fin); std::cout< +class ProjectedFunctionHermOp : public LinearFunction > > { +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + + OperatorFunction & _poly; + LinearOperatorBase &_Linop; + Aggregation &_Aggregate; + + ProjectedFunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop, + Aggregation &aggregate) : + _poly(poly), + _Linop(linop), + _Aggregate(aggregate) { }; + + void operator()(const CoarseField& in, CoarseField& out) { + + GridBase *FineGrid = _Aggregate.FineGrid; + + FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard; + FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard; + + _Aggregate.PromoteFromSubspace(in,fin); std::cout< +class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester > > +{ + public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + LinearFunction & _Poly; + OperatorFunction & _smoother; + LinearOperatorBase &_Linop; + Aggregation &_Aggregate; + RealD _coarse_relax_tol; + ImplicitlyRestartedLanczosSmoothedTester(LinearFunction &Poly, + OperatorFunction &smoother, + LinearOperatorBase &Linop, + Aggregation &Aggregate, + RealD coarse_relax_tol=5.0e3) + : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { }; + + int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) + { + CoarseField v(B); + RealD eval_poly = eval; + // Apply operator + _Poly(B,v); + + RealD vnum = real(innerProduct(B,v)); // HermOp. 
+ RealD vden = norm2(B); + RealD vv0 = norm2(v); + eval = vnum/vden; + v -= eval*B; + + RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0); + + std::cout.precision(13); + std::cout< nbasis ) eresid = eresid*_coarse_relax_tol; + if( (vv +class LocalCoherenceLanczos +{ +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice CoarseField; + typedef Lattice FineField; + +protected: + GridBase *_CoarseGrid; + GridBase *_FineGrid; + int _checkerboard; + LinearOperatorBase & _FineOp; + + // FIXME replace Aggregation with vector of fine; the code reuse is too small for + // the hassle and complexity of cross coupling. + Aggregation _Aggregate; + std::vector evals_fine; + std::vector evals_coarse; + std::vector evec_coarse; +public: + LocalCoherenceLanczos(GridBase *FineGrid, + GridBase *CoarseGrid, + LinearOperatorBase &FineOp, + int checkerboard) : + _CoarseGrid(CoarseGrid), + _FineGrid(FineGrid), + _Aggregate(CoarseGrid,FineGrid,checkerboard), + _FineOp(FineOp), + _checkerboard(checkerboard) + { + evals_fine.resize(0); + evals_coarse.resize(0); + }; + void Orthogonalise(void ) { _Aggregate.Orthogonalise(); } + + template static RealD normalise(T& v) + { + RealD nn = norm2(v); + nn = ::sqrt(nn); + v = v * (1.0/nn); + return nn; + } + + void fakeFine(void) + { + int Nk = nbasis; + _Aggregate.subspace.resize(Nk,_FineGrid); + _Aggregate.subspace[0]=1.0; + _Aggregate.subspace[0].checkerboard=_checkerboard; + normalise(_Aggregate.subspace[0]); + PlainHermOp Op(_FineOp); + for(int k=1;k Op(_FineOp); + ImplicitlyRestartedLanczosHermOpTester SimpleTester(Op); + for(int k=0;k ChebySmooth(cheby_smooth); + ProjectedFunctionHermOp ChebyOp (ChebySmooth,_FineOp,_Aggregate); + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); + + for(int k=0;k Cheby(cheby_parms); + FunctionHermOp ChebyOp(Cheby,_FineOp); + PlainHermOp Op(_FineOp); + + evals_fine.resize(Nm); + _Aggregate.subspace.resize(Nm,_FineGrid); + + ImplicitlyRestartedLanczos IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); + + FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; + + int Nconv; + IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false); + + // Shrink down to number saved + assert(Nstop>=nbasis); + assert(Nconv>=nbasis); + evals_fine.resize(nbasis); + _Aggregate.subspace.resize(nbasis,_FineGrid); + } + void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, + int Nstop, int Nk, int Nm,RealD resid, + RealD MaxIt, RealD betastp, int MinRes) + { + Chebyshev Cheby(cheby_op); + ProjectedHermOp Op(_FineOp,_Aggregate); + ProjectedFunctionHermOp ChebyOp (Cheby,_FineOp,_Aggregate); + ////////////////////////////////////////////////////////////////////////////////////////////////// + // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL + ////////////////////////////////////////////////////////////////////////////////////////////////// + + Chebyshev ChebySmooth(cheby_smooth); + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); + + evals_coarse.resize(Nm); + evec_coarse.resize(Nm,_CoarseGrid); + + CoarseField src(_CoarseGrid); src=1.0; + + ImplicitlyRestartedLanczos IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); + int Nconv=0; + IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); + assert(Nconv>=Nstop); + evals_coarse.resize(Nstop); + evec_coarse.resize 
(Nstop,_CoarseGrid); + for (int i=0;i -class ProjectedFunctionHermOp : public LinearFunction > > { -public: - typedef iVector CoarseSiteVector; - typedef Lattice CoarseField; - typedef Lattice CoarseScalar; // used for inner products on fine field - typedef Lattice FineField; - - - OperatorFunction & _poly; - LinearOperatorBase &_Linop; - Aggregation &_Aggregate; - - ProjectedFunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop, - Aggregation &aggregate) : - _poly(poly), - _Linop(linop), - _Aggregate(aggregate) { }; - - void operator()(const CoarseField& in, CoarseField& out) { - - GridBase *FineGrid = _Aggregate.FineGrid; - - FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard; - FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard; - - _Aggregate.PromoteFromSubspace(in,fin); std::cout< -class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester > > -{ - public: - typedef iVector CoarseSiteVector; - typedef Lattice CoarseField; - typedef Lattice CoarseScalar; // used for inner products on fine field - typedef Lattice FineField; - - LinearFunction & _Poly; - OperatorFunction & _smoother; - LinearOperatorBase &_Linop; - Aggregation &_Aggregate; - RealD _coarse_relax_tol; - ImplicitlyRestartedLanczosSmoothedTester(LinearFunction &Poly, - OperatorFunction &smoother, - LinearOperatorBase &Linop, - Aggregation &Aggregate, - RealD coarse_relax_tol=5.0e3) - : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { }; - - int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) - { - CoarseField v(B); - RealD eval_poly = eval; - // Apply operator - _Poly(B,v); - - RealD vnum = real(innerProduct(B,v)); // HermOp. - RealD vden = norm2(B); - RealD vv0 = norm2(v); - eval = vnum/vden; - v -= eval*B; - - RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0); - - std::cout.precision(13); - std::cout< nbasis ) eresid = eresid*_coarse_relax_tol; - if( (vv -class LocalCoherenceLanczos -{ -public: - typedef iVector CoarseSiteVector; - typedef Lattice CoarseScalar; // used for inner products on fine field - typedef Lattice CoarseField; - typedef Lattice FineField; - -private: - GridBase *_CoarseGrid; - GridBase *_FineGrid; - int _checkerboard; - LinearOperatorBase & _FineOp; - - // FIXME replace Aggregation with vector of fine; the code reuse is too small for - // the hassle and complexity of cross coupling. 
- Aggregation _Aggregate; - std::vector evals_fine; - std::vector evals_coarse; - std::vector evec_coarse; -public: - LocalCoherenceLanczos(GridBase *FineGrid, - GridBase *CoarseGrid, - LinearOperatorBase &FineOp, - int checkerboard) : - _CoarseGrid(CoarseGrid), - _FineGrid(FineGrid), - _Aggregate(CoarseGrid,FineGrid,checkerboard), - _FineOp(FineOp), - _checkerboard(checkerboard) - { - evals_fine.resize(0); - evals_coarse.resize(0); - }; - void Orthogonalise(void ) { _Aggregate.Orthogonalise(); } - - template static RealD normalise(T& v) - { - RealD nn = norm2(v); - nn = ::sqrt(nn); - v = v * (1.0/nn); - return nn; - } - - void fakeFine(void) - { - int Nk = nbasis; - _Aggregate.subspace.resize(Nk,_FineGrid); - _Aggregate.subspace[0]=1.0; - _Aggregate.subspace[0].checkerboard=_checkerboard; - normalise(_Aggregate.subspace[0]); - PlainHermOp Op(_FineOp); - for(int k=1;k &FineOp, + int checkerboard) + // Base constructor + : LocalCoherenceLanczos(FineGrid,CoarseGrid,FineOp,checkerboard) + {}; void checkpointFine(std::string evecs_file,std::string evals_file) { - assert(_Aggregate.subspace.size()==nbasis); + assert(this->_Aggregate.subspace.size()==nbasis); emptyUserRecord record; - { - ScidacWriter WR; - WR.open(evecs_file); - for(int k=0;k_Aggregate.subspace[k],record); } + WR.close(); + + XmlWriter WRx(evals_file); + write(WRx,"evals",this->evals_fine); } void checkpointFineRestore(std::string evecs_file,std::string evals_file) { - evals_fine.resize(nbasis); - _Aggregate.subspace.resize(nbasis,_FineGrid); - { - std::cout << GridLogIRL<< "checkpointFineRestore: Reading evals from "<evals_fine.resize(nbasis); + this->_Aggregate.subspace.resize(nbasis,this->_FineGrid); + + std::cout << GridLogIRL<< "checkpointFineRestore: Reading evals from "<evals_fine); + + assert(this->evals_fine.size()==nbasis); + + std::cout << GridLogIRL<< "checkpointFineRestore: Reading evecs from "< Op(_FineOp); - ImplicitlyRestartedLanczosHermOpTester SimpleTester(Op); - for(int k=0;k_Aggregate.subspace[k].checkerboard=this->_checkerboard; + RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record); + } + RD.close(); } void checkpointCoarse(std::string evecs_file,std::string evals_file) { - int n = evec_coarse.size(); + int n = this->evec_coarse.size(); emptyUserRecord record; - { - ScidacWriter WR; - WR.open(evecs_file); - for(int k=0;kevec_coarse[k],record); } + WR.close(); + + XmlWriter WRx(evals_file); + write(WRx,"evals",this->evals_coarse); } + void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec) { std::cout << " resizing to " << nvec<< std::endl; - evals_coarse.resize(nvec); - evec_coarse.resize(nvec,_CoarseGrid); - { - std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evals from "<evals_coarse.resize(nvec); + this->evec_coarse.resize(nvec,this->_CoarseGrid); + std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evals from "<evals_coarse); + assert(this->evals_coarse.size()==nvec); emptyUserRecord record; - { - std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evecs from "< ChebySmooth(cheby_smooth); - ProjectedFunctionHermOp ChebyOp (ChebySmooth,_FineOp,_Aggregate); - ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); - - for(int k=0;k Cheby(cheby_parms); - FunctionHermOp ChebyOp(Cheby,_FineOp); - PlainHermOp Op(_FineOp); - - evals_fine.resize(Nm); - _Aggregate.subspace.resize(Nm,_FineGrid); - - ImplicitlyRestartedLanczos IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); - - FineField 
src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; - - int Nconv; - IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false); - - // Shrink down to number saved - assert(Nstop>=nbasis); - assert(Nconv>=nbasis); - evals_fine.resize(nbasis); - _Aggregate.subspace.resize(nbasis,_FineGrid); - } - void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, - int Nstop, int Nk, int Nm,RealD resid, - RealD MaxIt, RealD betastp, int MinRes) - { - Chebyshev Cheby(cheby_op); - ProjectedHermOp Op(_FineOp,_Aggregate); - ProjectedFunctionHermOp ChebyOp (Cheby,_FineOp,_Aggregate); - ////////////////////////////////////////////////////////////////////////////////////////////////// - // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL - ////////////////////////////////////////////////////////////////////////////////////////////////// - - Chebyshev ChebySmooth(cheby_smooth); - ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); - - evals_coarse.resize(Nm); - evec_coarse.resize(Nm,_CoarseGrid); - - CoarseField src(_CoarseGrid); src=1.0; - - ImplicitlyRestartedLanczos IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); - int Nconv=0; - IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); - assert(Nconv>=Nstop); - evals_coarse.resize(Nstop); - evec_coarse.resize (Nstop,_CoarseGrid); - for (int i=0;ievec_coarse[k],record); } + RD.close(); } }; - int main (int argc, char ** argv) { Grid_init(&argc,&argv); @@ -465,7 +153,9 @@ int main (int argc, char ** argv) { std::vector blockSize = Params.blockSize; // Grids - GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); @@ -516,12 +206,10 @@ int main (int argc, char ** argv) { const int nbasis= 60; assert(nbasis==Ns1); - LocalCoherenceLanczos _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd); + LocalCoherenceLanczosScidac _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd); std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl; - if ( Params.doCoarse ) { - assert( (Params.doFine)||(Params.doFineRead)); - } + assert( (Params.doFine)||(Params.doFineRead)); if ( Params.doFine ) { std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "< Date: Fri, 27 Oct 2017 09:43:22 +0100 Subject: [PATCH 36/63] Passes reload of coarse basis --- lib/algorithms/iterative/LocalCoherenceLanczos.h | 6 +++++- tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/algorithms/iterative/LocalCoherenceLanczos.h b/lib/algorithms/iterative/LocalCoherenceLanczos.h index 6b8fe62c..d5d1bbc2 100644 --- a/lib/algorithms/iterative/LocalCoherenceLanczos.h +++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h @@ -285,7 +285,11 @@ public: ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); for(int k=0;k Date: Fri, 27 Oct 2017 10:29:34 +0100 Subject: [PATCH 37/63] Bug fix in the coarse restore... 
Think this is nearly there --- tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc index 0824cfa4..4c702a33 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc @@ -109,7 +109,7 @@ public: void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec) { - std::cout << " resizing to " << nvec<< std::endl; + std::cout << "resizing coarse vecs to " << nvec<< std::endl; this->evals_coarse.resize(nvec); this->evec_coarse.resize(nvec,this->_CoarseGrid); std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evals from "<evec_coarse[k],record); } RD.close(); From 1ef424b1392038df12130b1ce2f855c8b1cc1dbd Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 27 Oct 2017 14:20:35 +0100 Subject: [PATCH 38/63] Split grid Y2K bug fix attempt --- lib/communicator/Communicator_base.h | 14 +++++++++----- lib/communicator/Communicator_mpi.cc | 26 +++++++++++++++++++++----- lib/communicator/Communicator_none.cc | 8 ++++++-- lib/lattice/Lattice_transfer.h | 8 ++++---- 4 files changed, 40 insertions(+), 16 deletions(-) diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 22c9e4d0..ff054497 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -274,12 +274,16 @@ class CartesianCommunicator { // std::cerr << " AllToAll in.size() "< void Broadcast(int root,obj &data) { diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 5a2dc4d0..ef612f98 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -55,7 +55,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { CartesianCommunicator::~CartesianCommunicator() { - if (communicator && !MPI::Is_finalized()) + int MPI_is_finalised; + MPI_Finalized(&MPI_is_finalised); + if (communicator && MPI_is_finalised) MPI_Comm_free(&communicator); } @@ -195,7 +197,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) communicator); assert(ierr==0); } -void CartesianCommunicator::AllToAll(int dim,void *in,void *out,int bytes) +void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) { std::vector row(_ndimension,1); assert(dim>=0 && dim<_ndimension); @@ -204,11 +206,25 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,int bytes) row[dim] = _processors[dim]; CartesianCommunicator Comm(row,*this); - Comm.AllToAll(in,out,bytes); + Comm.AllToAll(in,out,words,bytes); } -void CartesianCommunicator::AllToAll(void *in,void *out,int bytes) +void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) { - MPI_Alltoall(in ,bytes,MPI_BYTE,out,bytes,MPI_BYTE,communicator); + // MPI is a pain and uses "int" arguments + // 64*64*64*128*16 == 500Million elements of data. + // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. + // (Turns up on 32^3 x 64 Gparity too) + MPI_Datatype object; + int iwords; + int ibytes; + iwords = words; + ibytes = bytes; + assert(words == iwords); // safe to cast to int ? + assert(bytes == ibytes); // safe to cast to int ? 
+ MPI_Type_contiguous(ibytes,MPI_BYTE,&object); + MPI_Type_commit(&object); + MPI_Alltoall(in,iwords,object,out,iwords,object,communicator); + MPI_Type_free(&object); } /////////////////////////////////////////////////////// // Should only be used prior to Grid Init finished. diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 629a3e4a..a862d52a 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -100,9 +100,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & { assert(0); } -void CartesianCommunicator::AllToAll(int dim,void *in,void *out,int bytes) +void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) { - bcopy(in,out,bytes); + bcopy(in,out,bytes*words); +} +void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) +{ + bcopy(in,out,bytes*words); } int CartesianCommunicator::RankWorld(void){return 0;} diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 713a8788..bc59e9eb 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -790,8 +790,8 @@ void Grid_split(std::vector > & full,Lattice & split) ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d]; } - int lsites = full_grid->lSites(); - Integer sz = lsites * nvector; + uint64_t lsites = full_grid->lSites(); + uint64_t sz = lsites * nvector; std::vector tmpdata(sz); std::vector alldata(sz); std::vector scalardata(lsites); @@ -908,8 +908,8 @@ void Grid_unsplit(std::vector > & full,Lattice & split) ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d]; } - int lsites = full_grid->lSites(); - Integer sz = lsites * nvector; + uint64_t lsites = full_grid->lSites(); + uint64_t sz = lsites * nvector; std::vector tmpdata(sz); std::vector alldata(sz); std::vector scalardata(lsites); From 689323f4eec85b159d82fe4b2b7097ff4312c70c Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 30 Oct 2017 00:03:15 +0000 Subject: [PATCH 39/63] Reverse dim ordering lexico support --- lib/util/Lexicographic.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/util/Lexicographic.h b/lib/util/Lexicographic.h index b922dba5..f5c55b74 100644 --- a/lib/util/Lexicographic.h +++ b/lib/util/Lexicographic.h @@ -26,6 +26,25 @@ namespace Grid{ } } + static inline void IndexFromCoorReversed (const std::vector& coor,int &index,const std::vector &dims){ + int nd=dims.size(); + int stride=1; + index=0; + for(int d=nd-1;d>=0;d--){ + index = index+stride*coor[d]; + stride=stride*dims[d]; + } + } + static inline void CoorFromIndexReversed (std::vector& coor,int index,const std::vector &dims){ + int nd= dims.size(); + coor.resize(nd); + for(int d=nd-1;d>=0;d--){ + coor[d] = index % dims[d]; + index = index / dims[d]; + } + } + + }; } From 4a699b4da340280d0502fcaab6d31b598e924f93 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 30 Oct 2017 00:04:14 +0000 Subject: [PATCH 40/63] New rank can be found out --- lib/cartesian/Cartesian_base.h | 9 +++++++-- lib/cartesian/Cartesian_full.h | 11 +++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index 6aa0e3c7..acc870de 100644 --- a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -44,13 +44,18 @@ namespace Grid{ class GridBase : public CartesianCommunicator , public GridThread { public: - + int dummy; // Give Lattice access template friend class Lattice; 
GridBase(const std::vector & processor_grid) : CartesianCommunicator(processor_grid) {}; GridBase(const std::vector & processor_grid, - const CartesianCommunicator &parent) : CartesianCommunicator(processor_grid,parent) {}; + const CartesianCommunicator &parent, + int &split_rank) + : CartesianCommunicator(processor_grid,parent,split_rank) {}; + GridBase(const std::vector & processor_grid, + const CartesianCommunicator &parent) + : CartesianCommunicator(processor_grid,parent,dummy) {}; virtual ~GridBase() = default; diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index c7ea68c9..9273abf3 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -38,7 +38,7 @@ namespace Grid{ class GridCartesian: public GridBase { public: - + int dummy; virtual int CheckerBoardFromOindexTable (int Oindex) { return 0; } @@ -67,7 +67,14 @@ public: GridCartesian(const std::vector &dimensions, const std::vector &simd_layout, const std::vector &processor_grid, - const GridCartesian &parent) : GridBase(processor_grid,parent) + const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) + { + Init(dimensions,simd_layout,processor_grid); + } + GridCartesian(const std::vector &dimensions, + const std::vector &simd_layout, + const std::vector &processor_grid, + const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) { Init(dimensions,simd_layout,processor_grid); } From fe4d9b003ca9c38ff6ec15e7445c22b0f4a72ade Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 30 Oct 2017 00:04:47 +0000 Subject: [PATCH 41/63] More digits --- lib/algorithms/iterative/ConjugateGradient.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index 5c968e04..0d4e51c7 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -78,12 +78,12 @@ class ConjugateGradient : public OperatorFunction { cp = a; ssq = norm2(src); - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl; - std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mmp " << b << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: cp,r " << cp << std::endl; + std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: p " << a << std::endl; RealD rsq = Tolerance * Tolerance * ssq; @@ -92,7 +92,7 @@ class ConjugateGradient : public OperatorFunction { return; } - std::cout << GridLogIterative << std::setprecision(4) + std::cout << GridLogIterative << 
std::setprecision(8) << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; From 5bf42e1e150cb0e9116e427653955cb4398b1326 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 30 Oct 2017 00:05:21 +0000 Subject: [PATCH 42/63] Update --- tests/solver/Test_dwf_hdcr.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index c553ba0a..b3373238 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -555,13 +555,13 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); - Subspace Aggregates(Coarse5d,FGrid); + Subspace Aggregates(Coarse5d,FGrid,0); // Aggregates.CreateSubspace(RNG5,HermDefOp,nbasis); assert ( (nbasis & 0x1)==0); int nb=nbasis/2; std::cout< Date: Mon, 30 Oct 2017 00:16:12 +0000 Subject: [PATCH 43/63] Communicator updates for split grid --- lib/communicator/Communicator_base.cc | 63 +++++++++++++++++++-------- lib/communicator/Communicator_base.h | 2 +- lib/communicator/Communicator_mpi.cc | 3 +- 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index ce9a3cf0..a72c75fe 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -97,9 +97,9 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) } -#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) +#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3) -CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent) +CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) { _ndimension = processors.size(); assert(_ndimension = parent._ndimension); @@ -124,33 +124,51 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors, for(int d=0;d<_ndimension;d++){ ccoor[d] = parent._processor_coor[d] % processors[d]; scoor[d] = parent._processor_coor[d] / processors[d]; - ssize[d] = parent._processors[d]/ processors[d]; + ssize[d] = parent._processors[d] / processors[d]; } - int crank,srank; // rank within subcomm ; rank of subcomm within blocks of subcomms - Lexicographic::IndexFromCoor(ccoor,crank,processors); - Lexicographic::IndexFromCoor(scoor,srank,ssize); + int crank; // rank within subcomm ; srank is rank of subcomm within blocks of subcomms + // Mpi uses the reverse Lexico convention to us + Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); + Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); MPI_Comm comm_split; if ( Nchild > 1 ) { - // std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec< &processors, ////////////////////////////////////////////////////////////////////////////////////////////////////// void CartesianCommunicator::InitFromMPICommunicator(const std::vector &processors, MPI_Comm communicator_base) { - // if ( communicator_base != communicator_world ) { - // std::cout << "Cartesian communicator created with a non-world communicator"< &proc } std::vector periodic(_ndimension,1); - MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],1,&communicator); + MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator); MPI_Comm_rank(communicator,&_processor); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); 
+ if ( communicator_base != communicator_world ) { + std::cout << "Cartesian communicator created with a non-world communicator"< &processors,const CartesianCommunicator &parent); + CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank); CartesianCommunicator(const std::vector &pdimensions_in); virtual ~CartesianCommunicator(); diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index ef612f98..5593aa8b 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -205,7 +205,8 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words, // Split the communicator row[dim] = _processors[dim]; - CartesianCommunicator Comm(row,*this); + int me; + CartesianCommunicator Comm(row,*this,me); Comm.AllToAll(in,out,words,bytes); } void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) From a7f72eb9946d782e48fe315be066ca95b5c097b6 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 30 Oct 2017 00:22:06 +0000 Subject: [PATCH 44/63] SHaking out --- lib/lattice/Lattice_transfer.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 962cdeb1..1b09217b 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -757,6 +757,7 @@ void precisionChange(Lattice &out, const Lattice &in){ // NB: Easiest to programme if keep in lex order. // ///////////////////////////////////////////////////////// + template void Grid_split(std::vector > & full,Lattice & split) { @@ -805,6 +806,7 @@ void Grid_split(std::vector > & full,Lattice & split) std::vector tmpdata(sz); std::vector alldata(sz); std::vector scalardata(lsites); + for(int v=0;v > & full,Lattice & split) std::vector ldims = full_grid->_ldimensions; std::vector lcoor(ndim); - for(int d=0;d=0;d--){ if ( ratio[d] != 1 ) { full_grid ->AllToAll(d,alldata,tmpdata); - + // std::cout << GridLogMessage << "Grid_split: dim " <_processors[d]< > & full,Lattice & split) int rmul=nvec*lsites; int vmul= lsites; alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul]; - + // if ( lsite==0 ) { + // std::cout << "Grid_split: grow alldata["< > & full,Lattice & split) } } } - vectorizeFromLexOrdArray(alldata,split); } @@ -936,10 +944,12 @@ void Grid_unsplit(std::vector > & full,Lattice & split) lsites = split_grid->lSites(); std::vector ldims = split_grid->_ldimensions; - for(int d=ndim-1;d>=0;d--){ + // for(int d=ndim-1;d>=0;d--){ + for(int d=0;d_processors[d] > 1 ) { tmpdata = alldata; split_grid->AllToAll(d,tmpdata,alldata); @@ -985,13 +995,11 @@ void Grid_unsplit(std::vector > & full,Lattice & split) lsites = full_grid->lSites(); for(int v=0;v Date: Mon, 30 Oct 2017 00:22:52 +0000 Subject: [PATCH 45/63] : --- tests/solver/Test_dwf_mrhs_cg.cc | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/solver/Test_dwf_mrhs_cg.cc b/tests/solver/Test_dwf_mrhs_cg.cc index 079fa85a..207e1331 100644 --- a/tests/solver/Test_dwf_mrhs_cg.cc +++ b/tests/solver/Test_dwf_mrhs_cg.cc @@ -52,15 +52,28 @@ int main (int argc, char ** argv) GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - int nrhs = UGrid->RankCount() ; - ///////////////////////////////////////////// // Split into 1^4 mpi communicators ///////////////////////////////////////////// + 
for(int i=0;i> mpi_split[k]; + } + break; + } + } + + int nrhs = 1; + int me; + for(int i=0;i seeds({1,2,3,4}); - GridParallelRNG pRNG(UGrid ); pRNG.SeedFixedIntegers(seeds); GridParallelRNG pRNG5(FGrid); pRNG5.SeedFixedIntegers(seeds); std::vector src(nrhs,FGrid); @@ -93,7 +105,7 @@ int main (int argc, char ** argv) emptyUserRecord record; std::string file("./scratch.scidac"); std::string filef("./scratch.scidac.ferm"); - int me = UGrid->ThisRank(); + LatticeGaugeField s_Umu(SGrid); FermionField s_src(SFGrid); FermionField s_src_split(SFGrid); @@ -169,7 +181,7 @@ int main (int argc, char ** argv) for(int n=0;nBarrier(); if ( n==me ) { - std::cerr << GridLogMessage<<"Split "<< me << " " << norm2(s_src_split) << " " << norm2(s_src)<< " diff " << norm2(s_tmp)<Barrier(); } @@ -218,7 +230,6 @@ int main (int argc, char ** argv) std::cout << " diff " < Date: Mon, 30 Oct 2017 00:23:34 +0000 Subject: [PATCH 46/63] Extended sub comm supported --- tests/solver/Test_split_grid.cc | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/solver/Test_split_grid.cc b/tests/solver/Test_split_grid.cc index 90969b85..2b6a4bf7 100644 --- a/tests/solver/Test_split_grid.cc +++ b/tests/solver/Test_split_grid.cc @@ -52,11 +52,24 @@ int main (int argc, char ** argv) GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - int nrhs = UGrid->RankCount() ; - ///////////////////////////////////////////// // Split into 1^4 mpi communicators ///////////////////////////////////////////// + + for(int i=0;i> mpi_split[k]; + } + break; + } + } + + int nrhs = 1; + for(int i=0;i Date: Mon, 30 Oct 2017 00:24:11 +0000 Subject: [PATCH 47/63] Get subrank info from communicator constructor --- tests/solver/Test_dwf_mrhs_cg_mpieo.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/solver/Test_dwf_mrhs_cg_mpieo.cc b/tests/solver/Test_dwf_mrhs_cg_mpieo.cc index 14115b59..a6dfcd57 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpieo.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpieo.cc @@ -47,7 +47,9 @@ int main (int argc, char ** argv) std::vector mpi_layout = GridDefaultMpi(); std::vector mpi_split (mpi_layout.size(),1); - GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); @@ -57,10 +59,11 @@ int main (int argc, char ** argv) ///////////////////////////////////////////// // Split into 1^4 mpi communicators ///////////////////////////////////////////// + int me; GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), mpi_split, - *UGrid); + *UGrid,me); GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid); GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid); @@ -89,8 +92,6 @@ int main (int argc, char ** argv) ///////////////// // MPI only sends ///////////////// - int me = UGrid->ThisRank(); - LatticeGaugeField s_Umu(SGrid); FermionField s_src(SFGrid); FermionField s_src_e(SFrbGrid); From 78e8704eacb41fae706e50c24ae0baa6b17b9481 Mon Sep 17 00:00:00 
2001 From: paboyle Date: Mon, 30 Oct 2017 00:25:31 +0000 Subject: [PATCH 48/63] Shaking out --- tests/solver/Test_dwf_mrhs_cg_mpi.cc | 99 +++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 10 deletions(-) diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index fbc6dd32..f640edff 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -1,4 +1,4 @@ - /************************************************************************************* + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -47,20 +47,36 @@ int main (int argc, char ** argv) std::vector mpi_layout = GridDefaultMpi(); std::vector mpi_split (mpi_layout.size(),1); - GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - int nrhs = UGrid->RankCount() ; - ///////////////////////////////////////////// // Split into 1^4 mpi communicators ///////////////////////////////////////////// + + for(int i=0;i> mpi_split[k]; + } + break; + } + } + + int nrhs = 1; + int me; + for(int i=0;i result(nrhs,FGrid); FermionField tmp(FGrid); - for(int s=0;sThisRank(); - LatticeGaugeField s_Umu(SGrid); FermionField s_src(SFGrid); FermionField s_tmp(SFGrid); @@ -98,6 +144,36 @@ int main (int argc, char ** argv) /////////////////////////////////////////////////////////////// Grid_split (Umu,s_Umu); Grid_split (src,s_src); + std::cout << " split rank " < HermOp(Ddwf); MdagMLinearOperator HermOpCk(Dchk); - ConjugateGradient CG((1.0e-5/(me+1)),10000); + ConjugateGradient CG((1.0e-5),10000); s_res = zero; CG(HermOp,s_src,s_res); + std::cout << " s_res norm "< Date: Mon, 30 Oct 2017 01:14:11 +0000 Subject: [PATCH 49/63] No compile on comms == none fix --- lib/communicator/Communicator_none.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index a862d52a..26b330a7 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -38,8 +38,8 @@ void CartesianCommunicator::Init(int *argc, char *** arv) ShmInitGeneric(); } -CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent) - : CartesianCommunicator(processors) {} +CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) + : CartesianCommunicator(processors) { srank=0;} CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { From 360efd0088847727e2fabe034dc5f18a09430cff Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 2 Nov 2017 22:05:31 +0000 Subject: [PATCH 50/63] Improved treatment of reverse asked for by chris. Truncate the basis. 
Power method renormalises --- .../iterative/ImplicitlyRestartedLanczos.h | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 089e7ff3..7a0760c9 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -181,8 +181,8 @@ enum IRLdiagonalisation { template class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester { public: - LinearFunction &_HermOpTest; - ImplicitlyRestartedLanczosHermOpTester(LinearFunction &HermOpTest) : _HermOpTest(HermOpTest) { }; + LinearFunction &_HermOp; + ImplicitlyRestartedLanczosHermOpTester(LinearFunction &HermOp) : _HermOp(HermOp) { }; int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) { return TestConvergence(j,resid,B,eval,evalMaxApprox); @@ -192,7 +192,7 @@ template class ImplicitlyRestartedLanczosHermOpTester : public Imp Field v(B); RealD eval_poly = eval; // Apply operator - _HermOpTest(B,v); + _HermOp(B,v); RealD vnum = real(innerProduct(B,v)); // HermOp. RealD vden = norm2(B); @@ -233,8 +233,8 @@ class ImplicitlyRestartedLanczos { //////////////////////////////// // Embedded objects //////////////////////////////// + LinearFunction &_PolyOp; LinearFunction &_HermOp; - LinearFunction &_HermOpTest; ImplicitlyRestartedLanczosTester &_Tester; // Default tester provided (we need a ref to something in default case) ImplicitlyRestartedLanczosHermOpTester SimpleTester; @@ -246,16 +246,22 @@ public: ////////////////////////////////////////////////////////////////// // PAB: ////////////////////////////////////////////////////////////////// - // Too many options & knobs. Do we really need orth_period + // Too many options & knobs. + // Eliminate: + // orth_period + // betastp + // MinRestart + // + // Do we really need orth_period // What is the theoretical basis & guarantees of betastp ? // Nstop=Nk viable? // MinRestart avoidable with new convergence test? - // Could cut to HermOp, HermOpTest, Tester, Nk, Nm, resid, maxiter (+diagonalisation) - // HermOpTest could be eliminated if we dropped the Power method for max eval. + // Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation) + // HermOp could be eliminated if we dropped the Power method for max eval. 
// -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear ////////////////////////////////////////////////////////////////// - ImplicitlyRestartedLanczos(LinearFunction & HermOp, - LinearFunction & HermOpTest, + ImplicitlyRestartedLanczos(LinearFunction & PolyOp, + LinearFunction & HermOp, ImplicitlyRestartedLanczosTester & Tester, int _Nstop, // sought vecs int _Nk, // sought vecs @@ -265,14 +271,14 @@ public: RealD _betastp=0.0, // if beta(k) < betastp: converged int _MinRestart=1, int _orth_period = 1, IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : - SimpleTester(HermOpTest), _HermOp(HermOp), _HermOpTest(HermOpTest), _Tester(Tester), + SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), eresid(_eresid), betastp(_betastp), MaxIter(_MaxIter) , MinRestart(_MinRestart), orth_period(_orth_period), diagonalisation(_diagonalisation) { }; - ImplicitlyRestartedLanczos(LinearFunction & HermOp, - LinearFunction & HermOpTest, + ImplicitlyRestartedLanczos(LinearFunction & PolyOp, + LinearFunction & HermOp, int _Nstop, // sought vecs int _Nk, // sought vecs int _Nm, // spare vecs @@ -281,7 +287,7 @@ public: RealD _betastp=0.0, // if beta(k) < betastp: converged int _MinRestart=1, int _orth_period = 1, IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : - SimpleTester(HermOpTest), _HermOp(HermOp), _HermOpTest(HermOpTest), _Tester(SimpleTester), + SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), eresid(_eresid), betastp(_betastp), MaxIter(_MaxIter) , MinRestart(_MinRestart), @@ -323,7 +329,7 @@ repeat →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM until convergence */ - void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse=true) + void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse=false) { GridBase *grid = src._grid; assert(grid == evec[0]._grid); @@ -355,7 +361,8 @@ until convergence auto tmp = src; const int _MAX_ITER_IRL_MEVAPP_ = 50; for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) { - _HermOpTest(src_n,tmp); + normalise(src_n); + _HermOp(src_n,tmp); RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. RealD vden = norm2(src_n); RealD na = vnum/vden; @@ -536,7 +543,10 @@ until convergence std::cout << GridLogIRL << "Nconv ("<0) w -= lme[k-1] * evec[k-1]; From 69929f20bbea5bcd125d9d296fd395e04e0e4dd2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 6 Nov 2017 23:45:00 +0000 Subject: [PATCH 51/63] Destructor fix. Split Grid and MPI3 will not yet work without more effort from me. 
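For orientation, a minimal sketch of the guard pattern involved (illustrative only, not one of the hunks below): a communicator held by a static or global object can be destroyed after MPI_Finalize has run, at which point MPI_Comm_free is no longer legal, so MPI_Finalized -- one of the few calls still permitted then -- decides whether freeing is safe. The loop assumes the duplicated per-dimension halo communicators used by the MPIT/MPI3 targets.

    CartesianCommunicator::~CartesianCommunicator()
    {
      int MPI_is_finalised = 0;
      MPI_Finalized(&MPI_is_finalised);          // still callable after MPI_Finalize
      if (communicator && !MPI_is_finalised) {   // free only while MPI is still live
        MPI_Comm_free(&communicator);
        for (int i = 0; i < communicator_halo.size(); i++) {
          MPI_Comm_free(&communicator_halo[i]);
        }
      }
    }
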
--- lib/communicator/Communicator_base.cc | 41 +++++++++++++++++++++++++-- lib/communicator/Communicator_base.h | 2 +- lib/communicator/Communicator_mpi.cc | 30 -------------------- lib/communicator/Communicator_mpi3.cc | 17 ++++++++++- lib/communicator/Communicator_mpit.cc | 8 +++++- 5 files changed, 62 insertions(+), 36 deletions(-) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index a72c75fe..531dd358 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -98,7 +98,39 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) #if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3) +void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) +{ + std::vector row(_ndimension,1); + assert(dim>=0 && dim<_ndimension); + // Split the communicator + row[dim] = _processors[dim]; + + int me; + CartesianCommunicator Comm(row,*this,me); + Comm.AllToAll(in,out,words,bytes); +} +void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) +{ + // MPI is a pain and uses "int" arguments + // 64*64*64*128*16 == 500Million elements of data. + // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. + // (Turns up on 32^3 x 64 Gparity too) + MPI_Datatype object; + int iwords; + int ibytes; + iwords = words; + ibytes = bytes; + assert(words == iwords); // safe to cast to int ? + assert(bytes == ibytes); // safe to cast to int ? + MPI_Type_contiguous(ibytes,MPI_BYTE,&object); + MPI_Type_commit(&object); + MPI_Alltoall(in,iwords,object,out,iwords,object,communicator); + MPI_Type_free(&object); +} +#endif + +#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) { _ndimension = processors.size(); @@ -176,6 +208,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors, ////////////////////////////////////////////////////////////////////////////////////////////////////// InitFromMPICommunicator(processors,comm_split); } + ////////////////////////////////////////////////////////////////////////////////////////////////////// // Take an MPI_Comm and self assemble ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -199,7 +232,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector &proc MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); if ( communicator_base != communicator_world ) { - std::cout << "Cartesian communicator created with a non-world communicator"< &proc int Size; MPI_Comm_size(communicator,&Size); -#ifdef GRID_COMMS_MPIT +#if defined(GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3) communicator_halo.resize (2*_ndimension); for(int i=0;i<_ndimension*2;i++){ MPI_Comm_dup(communicator,&communicator_halo[i]); @@ -220,7 +253,9 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector &proc assert(Size==_Nprocessors); } +#endif +#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { InitFromMPICommunicator(processors,communicator_world); @@ -229,10 +264,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) #endif #if !defined( GRID_COMMS_MPI3) - int CartesianCommunicator::NodeCount(void) { return ProcessorCount();}; int 
CartesianCommunicator::RankCount(void) { return ProcessorCount();}; #endif + #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT) double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int xmit_to_rank, diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 4374ac93..73ea6165 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -158,7 +158,7 @@ class CartesianCommunicator { virtual ~CartesianCommunicator(); private: -#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3) //////////////////////////////////////////////// // Private initialise from an MPI communicator // Can use after an MPI_Comm_split, but hidden from user so private diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 5593aa8b..f7b2a460 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -196,36 +196,6 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) root, communicator); assert(ierr==0); -} -void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) -{ - std::vector row(_ndimension,1); - assert(dim>=0 && dim<_ndimension); - - // Split the communicator - row[dim] = _processors[dim]; - - int me; - CartesianCommunicator Comm(row,*this,me); - Comm.AllToAll(in,out,words,bytes); -} -void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) -{ - // MPI is a pain and uses "int" arguments - // 64*64*64*128*16 == 500Million elements of data. - // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. - // (Turns up on 32^3 x 64 Gparity too) - MPI_Datatype object; - int iwords; - int ibytes; - iwords = words; - ibytes = bytes; - assert(words == iwords); // safe to cast to int ? - assert(bytes == ibytes); // safe to cast to int ? - MPI_Type_contiguous(ibytes,MPI_BYTE,&object); - MPI_Type_commit(&object); - MPI_Alltoall(in,iwords,object,out,iwords,object,communicator); - MPI_Type_free(&object); } /////////////////////////////////////////////////////// // Should only be used prior to Grid Init finished. 
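The "Y2K" comment in the relocated AllToAll is worth a concrete check: at 64^3 x 128 x 16 sites with 24*4 bytes per site the byte count is roughly 5x10^10, well past the 2^31-1 limit of the int arguments MPI_Alltoall accepts, which is why the count is kept in units of a committed contiguous datatype rather than raw bytes. A minimal standalone sketch, assuming an MPI build; only the volume and the 24x4-byte payload are taken from that comment, the buffer names are illustrative.

    #include <mpi.h>
    #include <cstdint>
    #include <cassert>
    #include <climits>

    int main(int argc, char **argv)
    {
      MPI_Init(&argc, &argv);

      uint64_t words = 64ULL * 64 * 64 * 128 * 16;   // ~5.4e8 sites
      uint64_t bytes = 24 * 4;                       // payload per site
      assert(words * bytes > (uint64_t)INT_MAX);     // a plain int byte count overflows

      // Commit one site's payload as a contiguous type so the int count passed
      // to MPI_Alltoall is "words", which still fits in an int.
      MPI_Datatype object;
      MPI_Type_contiguous((int)bytes, MPI_BYTE, &object);
      MPI_Type_commit(&object);
      // MPI_Alltoall(in, (int)words, object, out, (int)words, object, comm);
      MPI_Type_free(&object);

      MPI_Finalize();
      return 0;
    }
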
diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 3cac726c..9e023fef 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -454,11 +454,15 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &c ////////////////////////////////// // Try to subdivide communicator ////////////////////////////////// -CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent) +/* + * Use default in MPI compile + */ +CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) : CartesianCommunicator(processors) { std::cout << "Attempts to split MPI3 communicators will fail until implemented" < &processors) { int ierr; @@ -596,6 +600,17 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) } } }; +CartesianCommunicator::~CartesianCommunicator() +{ + int MPI_is_finalised; + MPI_Finalized(&MPI_is_finalised); + if (communicator && MPI_is_finalised) { + MPI_Comm_free(&communicator); + for(int i=0;i< communicator_halo.size();i++){ + MPI_Comm_free(&communicator_halo[i]); + } + } +} void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index 56f96c20..31f786ac 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -55,8 +55,14 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { CartesianCommunicator::~CartesianCommunicator() { - if (communicator && !MPI::Is_finalized()) + int MPI_is_finalised; + MPI_Finalized(&MPI_is_finalised); + if (communicator && MPI_is_finalised) { MPI_Comm_free(&communicator); + for(int i=0;i< communicator_halo.size();i++){ + MPI_Comm_free(&communicator_halo[i]); + } + } } From c519aab19dcfb559ab21585d9f9221b1fc193a60 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Tue, 7 Nov 2017 13:55:37 +0000 Subject: [PATCH 52/63] Fixing the MPI memory leak in the communicators --- lib/communicator/Communicator_mpi.cc | 2 +- lib/communicator/Communicator_mpi3.cc | 8 ++++++++ lib/communicator/Communicator_mpit.cc | 5 +++-- lib/communicator/Communicator_shmem.cc | 2 ++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 5593aa8b..1c1ae6c5 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -57,7 +57,7 @@ CartesianCommunicator::~CartesianCommunicator() { int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised); - if (communicator && MPI_is_finalised) + if (communicator && !MPI_is_finalised) MPI_Comm_free(&communicator); } diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 3cac726c..52f65c34 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -596,6 +596,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) } } }; +CartesianCommunicator::~CartesianCommunicator() +{ + int MPI_is_finalised; + MPI_Finalized(&MPI_is_finalised); + if (communicator && !MPI_is_finalised) + MPI_Comm_free(&communicator); +} + void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); diff --git 
a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index 56f96c20..1c24433b 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -55,11 +55,12 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { CartesianCommunicator::~CartesianCommunicator() { - if (communicator && !MPI::Is_finalized()) + int MPI_is_finalised; + MPI_Finalized(&MPI_is_finalised); + if (communicator && !MPI_is_finalised) MPI_Comm_free(&communicator); } - void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index ed49285d..03e3173e 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -75,6 +75,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { ShmInitGeneric(); } +CartesianCommunicator::~CartesianCommunicator(){} + CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent) : CartesianCommunicator(processors) { From 9b8d1cc3da4769f250665cc8e05fb305c794bc5d Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Tue, 7 Nov 2017 14:48:45 +0000 Subject: [PATCH 53/63] Staggered Schur decomposed matrix norm changed to not be the Schur anymore :( Carleton wanted this for multimass / multishift --- lib/algorithms/LinearOperator.h | 12 ++++++++++++ lib/algorithms/iterative/SchurRedBlack.h | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index 9b0e4942..0fa039c8 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -317,11 +317,23 @@ namespace Grid { } virtual RealD Mpc (const Field &in, Field &out) { Field tmp(in._grid); + Field tmp2(in._grid); + + _Mat.Mooee(in,out); + _Mat.Mooee(out,tmp); + + _Mat.Meooe(in,out); + _Mat.Meooe(out,tmp2); + + return axpy_norm(out,-1.0,tmp2,tmp); +#if 0 + //... much prefer conventional Schur norm _Mat.Meooe(in,tmp); _Mat.MooeeInv(tmp,out); _Mat.Meooe(out,tmp); _Mat.Mooee(in,out); return axpy_norm(out,-1.0,tmp,out); +#endif } virtual RealD MpcDag (const Field &in, Field &out){ return Mpc(in,out); diff --git a/lib/algorithms/iterative/SchurRedBlack.h b/lib/algorithms/iterative/SchurRedBlack.h index a0fd86a6..b9767aa8 100644 --- a/lib/algorithms/iterative/SchurRedBlack.h +++ b/lib/algorithms/iterative/SchurRedBlack.h @@ -90,7 +90,7 @@ namespace Grid { // Take a matrix and form a Red Black solver calling a Herm solver // Use of RB info prevents making SchurRedBlackSolve conform to standard interface /////////////////////////////////////////////////////////////////////////////////////////////////////// - + // Now make the norm reflect extra factor of Mee template class SchurRedBlackStaggeredSolve { private: OperatorFunction & _HermitianRBSolver; @@ -136,8 +136,8 @@ namespace Grid { _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); - src_o = tmp; assert(src_o.checkerboard ==Odd); - // _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source + //src_o = tmp; assert(src_o.checkerboard ==Odd); + _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm. 
////////////////////////////////////////////////////////////// // Call the red-black solver From 1860b1698c4b204a332e2fad4dea86c97ffb0abc Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Wed, 8 Nov 2017 09:03:01 +0000 Subject: [PATCH 54/63] Fixed the bag on MPI_T at Cam --- lib/communicator/Communicator_mpit.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc index b2fb71a6..bceea0d8 100644 --- a/lib/communicator/Communicator_mpit.cc +++ b/lib/communicator/Communicator_mpit.cc @@ -57,7 +57,7 @@ CartesianCommunicator::~CartesianCommunicator() { int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised); - if (communicator && !MPI_is_finalised) + if (communicator && !MPI_is_finalised){ MPI_Comm_free(&communicator); for(int i=0;i< communicator_halo.size();i++){ MPI_Comm_free(&communicator_halo[i]); @@ -246,7 +246,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector Date: Thu, 9 Nov 2017 19:46:57 +0000 Subject: [PATCH 55/63] Declaring virtual functions as pure virtual functions. --- lib/algorithms/iterative/ImplicitlyRestartedLanczos.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h index 7a0760c9..7b85c095 100644 --- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -168,8 +168,8 @@ void basisDeflate(const std::vector &_v,const std::vector& eval,co template class ImplicitlyRestartedLanczosTester { public: - virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox); - virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox); + virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0; + virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0; }; enum IRLdiagonalisation { From 94b8fb56862289c0663453c0e2d82fa6da310f38 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 19 Nov 2017 01:39:04 +0000 Subject: [PATCH 56/63] Debug in progress --- lib/communicator/Communicator_base.cc | 47 ++++++++++++------- lib/communicator/Communicator_mpi3.cc | 2 +- lib/lattice/Lattice_transfer.h | 67 +++++++++++++++++++++++++++ tests/solver/Test_dwf_mrhs_cg_mpi.cc | 6 +-- 4 files changed, 102 insertions(+), 20 deletions(-) diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 531dd358..223b07fd 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -134,8 +134,18 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t CartesianCommunicator::CartesianCommunicator(const std::vector &processors,const CartesianCommunicator &parent,int &srank) { _ndimension = processors.size(); - assert(_ndimension = parent._ndimension); - + + int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); + std::vector parent_processor_coor(_ndimension,0); + std::vector parent_processors (_ndimension,1); + + // Can make 5d grid from 4d etc... 
+ int pad = _ndimension-parent_ndimension; + for(int d=0;d &processors, std::vector ssize(_ndimension); // coor of split within parent for(int d=0;d<_ndimension;d++){ - ccoor[d] = parent._processor_coor[d] % processors[d]; - scoor[d] = parent._processor_coor[d] / processors[d]; - ssize[d] = parent._processors[d] / processors[d]; + ccoor[d] = parent_processor_coor[d] % processors[d]; + scoor[d] = parent_processor_coor[d] / processors[d]; + ssize[d] = parent_processors[d] / processors[d]; } int crank; // rank within subcomm ; srank is rank of subcomm within blocks of subcomms // Mpi uses the reverse Lexico convention to us @@ -166,38 +176,34 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors, MPI_Comm comm_split; if ( Nchild > 1 ) { - /* std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec< &processors, // Set up from the new split communicator ////////////////////////////////////////////////////////////////////////////////////////////////////// InitFromMPICommunicator(processors,comm_split); + + std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; + for(int d=0;d &out, const Lattice &in){ // NB: Easiest to programme if keep in lex order. // ///////////////////////////////////////////////////////// +/* +[0,0,0,0,0] S {V<4>{V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)}}} +[0,0,0,0,1] S {V<4>{V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)}}} +[0,0,0,0,2] S {V<4>{V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)}}} +[0,0,0,0,3] S {V<4>{V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)}}} +[0,0,0,0,4] S {V<4>{V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)}}} +[0,0,0,0,5] S {V<4>{V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)}}} +[0,0,0,0,6] S {V<4>{V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)}}} +[0,0,0,0,7] S {V<4>{V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)}}} +[0,0,0,0,8] S {V<4>{V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)}}} +[0,0,0,0,9] S {V<4>{V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)}}} +[0,0,0,0,10] S {V<4>{V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)}}} +[0,0,0,0,11] S {V<4>{V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)}}} +[0,0,0,0,12] S {V<4>{V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)}}} +[0,0,0,0,13] S {V<4>{V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)}}} +[0,0,0,0,14] S {V<4>{V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)}}} +[0,0,0,0,15] S {V<4>{V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)}}} + + +Process decomp +[A(0 1) A(2 3) B(0 1) B(2 3)] [ A(4 5) A(6 7) B(4 5) B(6 7)] [ A(8 9) A(10 11) B(8 9) B(10 11)] [A(12 13) A(14 15) B(12 13) B(14 15)] + +A2A(Full) + -- divides M*fL into fP segments of size M*fL/fP = fL/sP + -- total is fP * fL/sP = 
M * fL + A(0 1) A(4 5) A(8 9) A(12 13) + A(2 3) A(6 7) A(10 11) A(14 15) + B(0 1) B(4 5) B(8 9) B(12 13) + B(2 3) B(6 7) B(10 11) B(14 15) + + +A2A(Split) + A(0 1) A(4 5) A(2 3) A(6 7) + A(8 9) A(12 13) A(10 11) A(14 15) + B(0 1) B(2 3) B(4 5) B(6 7) + B(8 9) B(10 11) B(12 13) B(14 15) + +-------------------- +-- General case +-------------------- +G global lattice +fP - procs +sP - Procs in split grid +M - subdivisions/vectors - M*sP = fP ** constraint 1 +fL = G/fP per node (full) +sL = G/sP per node split + +[ G * M ] total = G*fP/sP. +[ Subdivide fL*M by fP => fL *M / fP = fL/fP *fP/sP = fL/sP ] +-------------------- +-- 1st A2A chunk is fL*M/fP = G/fP *fP/sP /fP = fL/sP +-- Let cL = fL/sP chunk. ( Divide into fP/sP = M chunks ) + +-- node 0 1st cL of node 0,1,... fP-1 ; vector 0 +-- node 1 2nd cL of node 0,1,... fP-1 +-- node 2 3nd cL of node 0,1,... fP-1 +-- node 3 4th cL of node 0,1,... fP-1 +... when node > sP get vector 1 etc... + +-- 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M) +-- node 0 1st cL of node 0M..(1M-1); 2nd cL of node 0M..(1M-1)).. +-- node 1 1st cL of node 1M..(2M-1); 2nd cL of node 1M..(2M-1).. +-- node 2 1st cL of node 2M..(3M-1); 2nd cL of node 2M..(3M-1).. +-- node 3 1st cL of node 3M..(3M-1); 2nd cL of node 2M..(3M-1).. +-- +-- Insert correctly + */ template void Grid_split(std::vector > & full,Lattice & split) { diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index f640edff..d380f91e 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -95,7 +95,7 @@ int main (int argc, char ** argv) FermionField tmp(FGrid); for(int s=0;s HermOp(Ddwf); MdagMLinearOperator HermOpCk(Dchk); - ConjugateGradient CG((1.0e-5),10000); + ConjugateGradient CG((1.0e-2),10000); s_res = zero; CG(HermOp,s_src,s_res); From e55397bc134ead26dbac8e2ef244406a8b9d6a3b Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 24 Nov 2017 14:18:30 +0000 Subject: [PATCH 57/63] Staggerd cg --- lib/algorithms/LinearOperator.h | 20 +++++++++++--------- lib/algorithms/iterative/SchurRedBlack.h | 9 +++++++-- lib/lattice/Lattice_transfer.h | 16 ++++++---------- tests/solver/Test_staggered_cg_schur.cc | 14 ++++++++++++++ 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index 0fa039c8..26746e6e 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -308,32 +308,34 @@ namespace Grid { public: SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){}; virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + GridLogIterative.TimingMode(1); + std::cout << GridLogIterative << " HermOpAndNorm "< inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ half.checkerboard = cb; - int ssh=0; - //parallel_for - for(int ss=0;ssoSites();ss++){ - std::vector coor; + + parallel_for(int ss=0;ssoSites();ss++){ int cbos; - + std::vector coor; full._grid->oCoorFromOindex(coor,ss); cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { + int ssh=half._grid->oIndex(coor); half._odata[ssh] = full._odata[ss]; - ssh++; } } } template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.checkerboard; - int ssh=0; - //parallel_for - for(int ss=0;ssoSites();ss++){ + parallel_for(int ss=0;ssoSites();ss++){ std::vector coor; int cbos; @@ -77,8 +73,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine) cbos=half._grid->CheckerBoard(coor); if (cbos==cb) { + int 
ssh=half._grid->oIndex(coor); full._odata[ss]=half._odata[ssh]; - ssh++; } } } diff --git a/tests/solver/Test_staggered_cg_schur.cc b/tests/solver/Test_staggered_cg_schur.cc index 09044995..a5c25b85 100644 --- a/tests/solver/Test_staggered_cg_schur.cc +++ b/tests/solver/Test_staggered_cg_schur.cc @@ -70,7 +70,21 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); SchurRedBlackStaggeredSolve SchurSolver(CG); + double volume=1.0; + for(int mu=0;mu volume * 1146 + double ncall=CG.IterationsToComplete; + double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 + + std::cout< &processors, ////////////////////////////////////////////////////////////////////////////////////////////////////// InitFromMPICommunicator(processors,comm_split); - std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; - for(int d=0;d &proc MPI_Comm_rank(communicator,&_processor); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); - if ( communicator_base != communicator_world ) { + if ( 0 && (communicator_base != communicator_world) ) { std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"< Date: Mon, 27 Nov 2017 12:33:08 +0000 Subject: [PATCH 59/63] Believe split/unsplit works, but need to make pretty --- lib/lattice/Lattice_transfer.h | 201 ++++++++++++++++++++++----------- 1 file changed, 133 insertions(+), 68 deletions(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index dd03fb4f..3d9289d6 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -890,50 +890,85 @@ void Grid_split(std::vector > & full,Lattice & split) if ( ratio[d] != 1 ) { full_grid ->AllToAll(d,alldata,tmpdata); - // std::cout << GridLogMessage << "Grid_split: dim " <_processors[d]<_processors[d] > 1 ) { + alldata=tmpdata; + split_grid->AllToAll(d,alldata,tmpdata); + } - auto rdims = ldims; rdims[d] *= ratio[d]; - auto rsites= lsites*ratio[d]; - for(int v=0;v_processors[d]; + int fP = full_grid->_processors[d]; - for(int r=0;r_processors[d] > 1 ) { - tmpdata = alldata; - split_grid->AllToAll(d,tmpdata,alldata); - } } } vectorizeFromLexOrdArray(alldata,split); @@ -1008,55 +1043,84 @@ void Grid_unsplit(std::vector > & full,Lattice & split) std::vector rcoor(ndim); int nvec = 1; - lsites = split_grid->lSites(); - std::vector ldims = split_grid->_ldimensions; + uint64_t rsites = split_grid->lSites(); + std::vector rdims = split_grid->_ldimensions; - // for(int d=ndim-1;d>=0;d--){ for(int d=0;d_processors[d]; + int fP = full_grid->_processors[d]; - if ( split_grid->_processors[d] > 1 ) { - tmpdata = alldata; - split_grid->AllToAll(d,tmpdata,alldata); - } + int M = ratio[d]; + auto ldims = rdims; ldims[d] /= M; // Decrease local dims by same factor + auto lsites= rsites/M; // Decreases rsites by M - ////////////////////////////////////////// - //Local volume for this dimension is expanded by ratio of processor extents - // Number of vectors is decreased by same factor - // Rearrange to lexico for bigger volume - ////////////////////////////////////////// - auto rsites= lsites/ratio[d]; - auto rdims = ldims; rdims[d]/=ratio[d]; + int fvol = lsites; + int svol = rsites; + int chunk = (nvec*fvol)/sP; + int cL = (nvec*ldims[d])/sP; + + for(int c=0;c= tmpdata.size() ) { - // rsite, rcoor --> smaller local volume - // lsite, lcoor --> bigger original (single node?) 
volume - // For loop over each site within smaller subvol - for(int rsite=0;rsiteAllToAll(d,tmpdata,alldata); + if ( split_grid->_processors[d] > 1 ) { + split_grid->AllToAll(d,tmpdata,alldata); + tmpdata=alldata; + } + full_grid ->AllToAll(d,tmpdata,alldata); + + rdims[d]/= M; + rsites /= M; + nvec *= M; // Increase nvec by subdivision factor + } } } @@ -1064,12 +1128,13 @@ void Grid_unsplit(std::vector > & full,Lattice & split) for(int v=0;v Date: Mon, 27 Nov 2017 12:34:25 +0000 Subject: [PATCH 60/63] Clean on multinode target after split 1 1 2 4 -> 1 1 2 2 --- tests/solver/Test_dwf_mrhs_cg_mpi.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index d380f91e..b3611e01 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -121,12 +121,12 @@ int main (int argc, char ** argv) random(pRNG5,src[s]); tmp = 100.0*s; src[s] = (src[s] * 0.1) + tmp; - std::cout << " src ["< Date: Mon, 27 Nov 2017 15:10:22 +0000 Subject: [PATCH 61/63] Debug --- tests/solver/Test_dwf_mrhs_cg_mpi.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index b3611e01..06df58c6 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -173,6 +173,7 @@ int main (int argc, char ** argv) // std::cout << " s_src \n" << s_src << std::endl; // std::cout << " s_src_tmp \n" << s_src_tmp << std::endl; // std::cout << " s_src_diff \n" << s_src_diff << std::endl; + // exit(0); #endif /////////////////////////////////////////////////////////////// From 28ceacec45e052578cf6b4fa1f394c87f417d1d2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 27 Nov 2017 15:13:29 +0000 Subject: [PATCH 62/63] Split/Unsplit working --- lib/lattice/Lattice_transfer.h | 275 ++++++++------------------------- 1 file changed, 65 insertions(+), 210 deletions(-) diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 78b80ba4..c7e2a507 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -694,30 +694,6 @@ void precisionChange(Lattice &out, const Lattice &in){ //////////////////////////////////////////////////////////////////////////////// // Communicate between grids //////////////////////////////////////////////////////////////////////////////// -// -// All to all plan -// -// Subvolume on fine grid is v. Vectors a,b,c,d -// -/////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SIMPLEST CASE: -/////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Mesh of nodes (2) ; subdivide to 1 subdivisions -// -// Lex ord: -// N0 va0 vb0 N1 va1 vb1 -// -// For each dimension do an all to all -// -// full AllToAll(0) -// N0 va0 va1 N1 vb0 vb1 -// -// REARRANGE -// N0 va01 N1 vb01 -// -// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract". -// NB: Easiest to programme if keep in lex order. 
-// /////////////////////////////////////////////////////////////////////////////////////////////////////////// // SIMPLE CASE: /////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -751,75 +727,16 @@ void precisionChange(Lattice &out, const Lattice &in){ // // Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract". // NB: Easiest to programme if keep in lex order. -// -///////////////////////////////////////////////////////// /* - -[0,0,0,0,0] S {V<4>{V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)},V<3>{(0,0),(0,0),(0,0)}}} -[0,0,0,0,1] S {V<4>{V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)},V<3>{(1,0),(1,0),(1,0)}}} -[0,0,0,0,2] S {V<4>{V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)},V<3>{(4,0),(4,0),(4,0)}}} -[0,0,0,0,3] S {V<4>{V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)},V<3>{(5,0),(5,0),(5,0)}}} -[0,0,0,0,4] S {V<4>{V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)},V<3>{(2,0),(2,0),(2,0)}}} -[0,0,0,0,5] S {V<4>{V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)},V<3>{(3,0),(3,0),(3,0)}}} -[0,0,0,0,6] S {V<4>{V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)},V<3>{(6,0),(6,0),(6,0)}}} -[0,0,0,0,7] S {V<4>{V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)},V<3>{(7,0),(7,0),(7,0)}}} -[0,0,0,0,8] S {V<4>{V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)},V<3>{(8,0),(8,0),(8,0)}}} -[0,0,0,0,9] S {V<4>{V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)},V<3>{(9,0),(9,0),(9,0)}}} -[0,0,0,0,10] S {V<4>{V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)},V<3>{(12,0),(12,0),(12,0)}}} -[0,0,0,0,11] S {V<4>{V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)},V<3>{(13,0),(13,0),(13,0)}}} -[0,0,0,0,12] S {V<4>{V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)},V<3>{(10,0),(10,0),(10,0)}}} -[0,0,0,0,13] S {V<4>{V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)},V<3>{(11,0),(11,0),(11,0)}}} -[0,0,0,0,14] S {V<4>{V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)},V<3>{(14,0),(14,0),(14,0)}}} -[0,0,0,0,15] S {V<4>{V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)},V<3>{(15,0),(15,0),(15,0)}}} - - -Process decomp -[A(0 1) A(2 3) B(0 1) B(2 3)] [ A(4 5) A(6 7) B(4 5) B(6 7)] [ A(8 9) A(10 11) B(8 9) B(10 11)] [A(12 13) A(14 15) B(12 13) B(14 15)] - -A2A(Full) - -- divides M*fL into fP segments of size M*fL/fP = fL/sP - -- total is fP * fL/sP = M * fL - A(0 1) A(4 5) A(8 9) A(12 13) - A(2 3) A(6 7) A(10 11) A(14 15) - B(0 1) B(4 5) B(8 9) B(12 13) - B(2 3) B(6 7) B(10 11) B(14 15) - - -A2A(Split) - A(0 1) A(4 5) A(2 3) A(6 7) - A(8 9) A(12 13) A(10 11) A(14 15) - B(0 1) B(2 3) B(4 5) B(6 7) - B(8 9) B(10 11) B(12 13) B(14 15) - --------------------- --- General case --------------------- -G global lattice -fP - procs -sP - Procs in split grid -M - subdivisions/vectors - M*sP = fP ** constraint 1 -fL = G/fP per node (full) -sL = G/sP per node split - -[ G * M ] total = G*fP/sP. -[ Subdivide fL*M by fP => fL *M / fP = fL/fP *fP/sP = fL/sP ] --------------------- --- 1st A2A chunk is fL*M/fP = G/fP *fP/sP /fP = fL/sP --- Let cL = fL/sP chunk. 
( Divide into fP/sP = M chunks ) - --- node 0 1st cL of node 0,1,... fP-1 ; vector 0 --- node 1 2nd cL of node 0,1,... fP-1 --- node 2 3nd cL of node 0,1,... fP-1 --- node 3 4th cL of node 0,1,... fP-1 -... when node > sP get vector 1 etc... - --- 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M) --- node 0 1st cL of node 0M..(1M-1); 2nd cL of node 0M..(1M-1)).. --- node 1 1st cL of node 1M..(2M-1); 2nd cL of node 1M..(2M-1).. --- node 2 1st cL of node 2M..(3M-1); 2nd cL of node 2M..(3M-1).. --- node 3 1st cL of node 3M..(3M-1); 2nd cL of node 2M..(3M-1).. --- --- Insert correctly + * Let chunk = (fvol*nvec)/sP be size of a chunk. ( Divide lexico vol * nvec into fP/sP = M chunks ) + * + * 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M) + * + * node 0 1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1).. data chunk x M x sP = fL / sP * M * sP = fL * M growth + * node 1 1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1).. + * node 2 1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1).. + * node 3 1st chunk of node 3M..(3M-1); 2nd chunk of node 2M..(3M-1).. + * etc... */ template void Grid_split(std::vector > & full,Lattice & split) @@ -879,7 +796,6 @@ void Grid_split(std::vector > & full,Lattice & split) int nvec = nvector; // Counts down to 1 as we collapse dims std::vector ldims = full_grid->_ldimensions; - std::vector lcoor(ndim); for(int d=ndim-1;d>=0;d--){ @@ -891,73 +807,40 @@ void Grid_split(std::vector > & full,Lattice & split) split_grid->AllToAll(d,alldata,tmpdata); } - /* --- Let chunk = (fL*nvec)/sP chunk. ( Divide into fP/sP = M chunks ) --- --- 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M) --- --- node 0 1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1).. data chunk x M x sP = fL / sP * M * sP = fL * M growth --- node 1 1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1).. --- node 2 1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1).. --- node 3 1st chunk of node 3M..(3M-1); 2nd chunk of node 2M..(3M-1).. 
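/*
 * A small numeric instance of the chunk bookkeeping above, with numbers chosen
 * purely for illustration: one direction d with fP = 4 full-grid ranks,
 * sP = 2 split-grid ranks, hence M = ratio[d] = 2, and fvol = lsites = 4 local
 * sites carrying nvec = 2 vectors per rank:
 *
 *   chunk = (nvec*fvol)/sP = (2*4)/2 = 4
 *
 *   split rank 0 gathers one chunk from full ranks 0..1  ( 0M .. 1M-1 )
 *   split rank 1 gathers one chunk from full ranks 2..3  ( 1M .. 2M-1 )
 *
 * After the step rsites = lsites*M = 8, rdims[d] = ldims[d]*M and nvec = 2/M = 1,
 * so the local volume in d grows by M while the vector count shrinks by M,
 * keeping nvec*volume per rank fixed (2*4 = 1*8).
 */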
--- --- Loop over c = 0..chunk-1 --- Loop over n = 0..M --- Loop over j = 0..sP --- total chunk*M*sP = fL/sP*fP/sP*sP = G/sP = sL --- csite = (c+m*chunk)% --- split into m*chunk+o = lsite*nvec/fP --- Must turn to vec, rsite, - */ - auto rdims = ldims; - int M = ratio[d]; - nvec /= M; // Reduce nvec by subdivision factor - rdims[d] *= M; // increase local dims by same factor + auto M = ratio[d]; auto rsites= lsites*M;// increases rsites by M + nvec /= M; // Reduce nvec by subdivision factor + rdims[d] *= M; // increase local dim by same factor int sP = split_grid->_processors[d]; int fP = full_grid->_processors[d]; int fvol = lsites; - int svol = rsites; - int chunk = (nvec*fvol)/sP; - int cL = (nvec*ldims[d])/sP; - - for(int c=0;c coor(ndim); + Lexicographic::CoorFromIndex(coor, lex_fvol, ldims); + coor[d] += m*ldims[d]; + Lexicographic::IndexFromCoor(coor, lex_r, rdims); + lex_r += lex_vec * rsites; - alldata[rsite] = tmpdata[c+chunk*m+chunk*M*s]; - - if ( 0 - &&(lcoor[0]==0) - &&(lcoor[1]==0) - &&(lcoor[2]==0) - &&(lcoor[3]==0) ) { - - std::cout << GridLogMessage << " SPLIT rcoor[d] = "< > & full,Lattice & split) ///////////////////////////////////////////////////////////////// // Start from split grid and work towards full grid ///////////////////////////////////////////////////////////////// - std::vector lcoor(ndim); - std::vector rcoor(ndim); int nvec = 1; uint64_t rsites = split_grid->lSites(); @@ -1046,77 +927,52 @@ void Grid_unsplit(std::vector > & full,Lattice & split) if ( ratio[d] != 1 ) { - { - int sP = split_grid->_processors[d]; - int fP = full_grid->_processors[d]; + auto M = ratio[d]; - int M = ratio[d]; - auto ldims = rdims; ldims[d] /= M; // Decrease local dims by same factor - auto lsites= rsites/M; // Decreases rsites by M - - int fvol = lsites; - int svol = rsites; - int chunk = (nvec*fvol)/sP; - int cL = (nvec*ldims[d])/sP; + int sP = split_grid->_processors[d]; + int fP = full_grid->_processors[d]; + + auto ldims = rdims; ldims[d] /= M; // Decrease local dims by same factor + auto lsites= rsites/M; // Decreases rsites by M + + int fvol = lsites; + int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol); + { + // Loop over reordered data post A2A for(int c=0;c= tmpdata.size() ) { - - std::cout << "c "< coor(ndim); + Lexicographic::CoorFromIndex(coor, lex_fvol, ldims); + coor[d] += m*ldims[d]; + Lexicographic::IndexFromCoor(coor, lex_r, rdims); + lex_r += lex_vec * rsites; + // LexicoFind coordinate & vector number within split lattice + tmpdata[lex_c] = alldata[lex_r]; } } } - - if ( split_grid->_processors[d] > 1 ) { - split_grid->AllToAll(d,tmpdata,alldata); - tmpdata=alldata; - } - full_grid ->AllToAll(d,tmpdata,alldata); - - rdims[d]/= M; - rsites /= M; - nvec *= M; // Increase nvec by subdivision factor } + + if ( split_grid->_processors[d] > 1 ) { + split_grid->AllToAll(d,tmpdata,alldata); + tmpdata=alldata; + } + full_grid ->AllToAll(d,tmpdata,alldata); + rdims[d]/= M; + rsites /= M; + nvec *= M; // Increase nvec by subdivision factor } } @@ -1129,7 +985,6 @@ void Grid_unsplit(std::vector > & full,Lattice & split) } vectorizeFromLexOrdArray(scalardata,full[v]); } - } } From 2427a21428b6704a08119a24c60f0e77830c55c7 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Fri, 1 Dec 2017 19:44:07 +0000 Subject: [PATCH 63/63] minor serial IO fixes, XML now issues warning when trying to read absent nodes, these becomes --- lib/serialisation/JSON_IO.cc | 2 +- lib/serialisation/MacroMagic.h | 6 ++++- lib/serialisation/XmlIO.cc | 41 ++++++++++++++++++++++------------ 
lib/serialisation/XmlIO.h | 10 +++++++-- 4 files changed, 41 insertions(+), 18 deletions(-) diff --git a/lib/serialisation/JSON_IO.cc b/lib/serialisation/JSON_IO.cc index 99a9cdd6..6a01aa84 100644 --- a/lib/serialisation/JSON_IO.cc +++ b/lib/serialisation/JSON_IO.cc @@ -25,7 +25,7 @@ See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#include +#include using namespace Grid; using namespace std; diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index 774c947f..5df2c780 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -125,7 +125,11 @@ static inline void write(Writer &WR,const std::string &s, const cname &obj){ }\ template \ static inline void read(Reader &RD,const std::string &s, cname &obj){ \ - push(RD,s);\ + if (!push(RD,s))\ + {\ + std::cout << Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl;\ + return;\ + };\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ pop(RD);\ }\ diff --git a/lib/serialisation/XmlIO.cc b/lib/serialisation/XmlIO.cc index 260611a5..8ac7422c 100644 --- a/lib/serialisation/XmlIO.cc +++ b/lib/serialisation/XmlIO.cc @@ -100,13 +100,16 @@ XmlReader::XmlReader(const string &fileName,string toplev) : fileName_(fileName) bool XmlReader::push(const string &s) { + if (node_.child(s.c_str())) + { + node_ = node_.child(s.c_str()); - if (node_.child(s.c_str()) == NULL ) + return true; + } + else + { return false; - - node_ = node_.child(s.c_str()); - return true; - + } } void XmlReader::pop(void) @@ -117,20 +120,30 @@ void XmlReader::pop(void) bool XmlReader::nextElement(const std::string &s) { if (node_.next_sibling(s.c_str())) - { - node_ = node_.next_sibling(s.c_str()); - - return true; - } + { + node_ = node_.next_sibling(s.c_str()); + + return true; + } else - { - return false; - } + { + return false; + } } template <> void XmlReader::readDefault(const string &s, string &output) { - output = node_.child(s.c_str()).first_child().value(); + if (node_.child(s.c_str())) + { + output = node_.child(s.c_str()).first_child().value(); + } + else + { + std::cout << GridLogWarning << "XML: cannot open node '" << s << "'"; + std::cout << std::endl; + + output = ""; + } } diff --git a/lib/serialisation/XmlIO.h b/lib/serialisation/XmlIO.h index fcdbf1e4..e37eb8d9 100644 --- a/lib/serialisation/XmlIO.h +++ b/lib/serialisation/XmlIO.h @@ -39,6 +39,7 @@ Author: paboyle #include #include +#include namespace Grid { @@ -119,7 +120,6 @@ namespace Grid std::string buf; readDefault(s, buf); - // std::cout << s << " " << buf << std::endl; fromString(output, buf); } @@ -132,7 +132,13 @@ namespace Grid std::string buf; unsigned int i = 0; - push(s); + if (!push(s)) + { + std::cout << GridLogWarning << "XML: cannot open node '" << s << "'"; + std::cout << std::endl; + + return; + } while (node_.child("elem")) { output.resize(i + 1);
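/*
 * A minimal sketch of the behaviour targeted by these serialisation changes,
 * assuming a GRID_SERIALIZABLE_CLASS_MEMBERS struct named ActionParams and an
 * input file with no <ActionParams> node (both names are illustrative, not
 * taken from the patch):
 *
 *   XmlReader RD("params.xml");
 *   ActionParams p;               // default-constructed members
 *   read(RD, "ActionParams", p);  // push() now reports failure instead of
 *                                 // descending into a null node; a
 *                                 // GridLogWarning is printed and p keeps
 *                                 // its default values
 */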