Better IRL interface

2026-07-20 18:43:27 +01:00 · 2017-10-26 01:59:59 +01:00
parent e9be293444
commit ccd20df827
4 changed files with 557 additions and 12 deletions
@@ -0,0 +1,143 @@
+namespace Grid { 
+
+/*
+  BlockProjector
+
+  If _HP_BLOCK_PROJECTORS_ is defined, we assume that _evec is a basis that is not
+  fully orthonormalized (to the precision of the coarse field) and we allow for higher-precision
+  coarse field than basis field.
+
+*/
+//#define _HP_BLOCK_PROJECTORS_
+
+template<typename Field>
+class BlockProjector {
+public:
+
+  BasisFieldVector<Field>& _evec;
+  BlockedGrid<Field>& _bgrid;
+
+  BlockProjector(BasisFieldVector<Field>& evec, BlockedGrid<Field>& bgrid) : _evec(evec), _bgrid(bgrid) {
+  }
+
+  void createOrthonormalBasis(RealD thres = 0.0) {
+
+    GridStopWatch sw;
+    sw.Start();
+
+    int cnt = 0;
+
+#pragma omp parallel shared(cnt)
+    {
+      int lcnt = 0;
+
+#pragma omp for
+      for (int b=0;b<_bgrid._o_blocks;b++) {
+	
+	for (int i=0;i<_evec._Nm;i++) {
+	  
+	  auto nrm0 = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]);
+	  
+	  // |i> -= <j|i> |j>
+	  for (int j=0;j<i;j++) {
+	    _bgrid.block_caxpy(b,_evec._v[i],-_bgrid.block_sp(b,_evec._v[j],_evec._v[i]),_evec._v[j],_evec._v[i]);
+	  }
+	  
+	  auto nrm = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]);
+	  
+	  auto eps = nrm/nrm0;
+	  if (Reduce(eps).real() < thres) {
+	    lcnt++;
+	  }
+	  
+	  // TODO: if norm is too small, remove this eigenvector/mark as not needed; in practice: set it to zero norm here and return a mask
+	  // that is then used later to decide not to write certain eigenvectors to disk (add a norm calculation before subtraction step and look at nrm/nrm0 < eps to decide)
+	  _bgrid.block_cscale(b,1.0 / sqrt(nrm),_evec._v[i]);
+	  
+	}
+	
+      }
+
+#pragma omp critical
+      {
+	cnt += lcnt;
+      }
+    }
+    sw.Stop();
+    std::cout << GridLogMessage << "Gram-Schmidt to create blocked basis took " << sw.Elapsed() << " (" << ((RealD)cnt / (RealD)_bgrid._o_blocks / (RealD)_evec._Nm) 
+	      << " below threshold)" << std::endl;
+
+  }
+
+  template<typename CoarseField>
+  void coarseToFine(const CoarseField& in, Field& out) {
+
+    out = zero;
+    out.checkerboard = _evec._v[0].checkerboard;
+
+    int Nbasis = sizeof(in._odata[0]._internal._internal) / sizeof(in._odata[0]._internal._internal[0]);
+    assert(Nbasis == _evec._Nm);
+    
+#pragma omp parallel for
+    for (int b=0;b<_bgrid._o_blocks;b++) {
+      for (int j=0;j<_evec._Nm;j++) {
+	_bgrid.block_caxpy(b,out,in._odata[b]._internal._internal[j],_evec._v[j],out);
+      }
+    }
+
+  }
+
+  template<typename CoarseField>
+  void fineToCoarse(const Field& in, CoarseField& out) {
+
+    out = zero;
+
+    int Nbasis = sizeof(out._odata[0]._internal._internal) / sizeof(out._odata[0]._internal._internal[0]);
+    assert(Nbasis == _evec._Nm);
+
+
+    Field tmp(_bgrid._grid);
+    tmp = in;
+    
+#pragma omp parallel for
+    for (int b=0;b<_bgrid._o_blocks;b++) {
+      for (int j=0;j<_evec._Nm;j++) {
+	// |rhs> -= <j|rhs> |j>
+	auto c = _bgrid.block_sp(b,_evec._v[j],tmp);
+	_bgrid.block_caxpy(b,tmp,-c,_evec._v[j],tmp); // may make this more numerically stable
+	out._odata[b]._internal._internal[j] = c;
+      }
+    }
+
+  }
+
+  template<typename CoarseField>
+    void deflateFine(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+    result = zero;
+    for (int i=0;i<N;i++) {
+      Field tmp(result._grid);
+      coarseToFine(_coef._v[i],tmp);
+      axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
+    }
+  }
+
+  template<typename CoarseField>
+    void deflateCoarse(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+    CoarseField src_coarse(_coef._v[0]._grid);
+    CoarseField result_coarse = src_coarse;
+    result_coarse = zero;
+    fineToCoarse(src_orig,src_coarse);
+    for (int i=0;i<N;i++) {
+      axpy(result_coarse,TensorRemove(innerProduct(_coef._v[i],src_coarse)) / eval[i],_coef._v[i],result_coarse);
+    }
+    coarseToFine(result_coarse,result);
+  }
+
+  template<typename CoarseField>
+    void deflate(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+    // Deflation on coarse Grid is much faster, so use it by default.  Deflation on fine Grid is kept for legacy reasons for now.
+    deflateCoarse(_coef,eval,N,src_orig,result);
+  }
+
+};
+}
@@ -0,0 +1,401 @@
+namespace Grid {
+
+template<typename Field>
+class BlockedGrid {
+public:
+  GridBase* _grid;
+  typedef typename Field::scalar_type  Coeff_t;
+  typedef typename Field::vector_type vCoeff_t;
+  
+  std::vector<int> _bs; // block size
+  std::vector<int> _nb; // number of blocks
+  std::vector<int> _l;  // local dimensions irrespective of cb
+  std::vector<int> _l_cb;  // local dimensions of checkerboarded vector
+  std::vector<int> _l_cb_o;  // local dimensions of inner checkerboarded vector
+  std::vector<int> _bs_cb; // block size in checkerboarded vector
+  std::vector<int> _nb_o; // number of blocks of simd o-sites
+
+  int _nd, _blocks, _cf_size, _cf_block_size, _cf_o_block_size, _o_blocks, _block_sites;
+  
+  BlockedGrid(GridBase* grid, const std::vector<int>& block_size) :
+    _grid(grid), _bs(block_size), _nd((int)_bs.size()), 
+      _nb(block_size), _l(block_size), _l_cb(block_size), _nb_o(block_size),
+      _l_cb_o(block_size), _bs_cb(block_size) {
+
+    _blocks = 1;
+    _o_blocks = 1;
+    _l = grid->FullDimensions();
+    _l_cb = grid->LocalDimensions();
+    _l_cb_o = grid->_rdimensions;
+
+    _cf_size = 1;
+    _block_sites = 1;
+    for (int i=0;i<_nd;i++) {
+      _l[i] /= grid->_processors[i];
+
+      assert(!(_l[i] % _bs[i])); // lattice must accommodate choice of blocksize
+
+      int r = _l[i] / _l_cb[i];
+      assert(!(_bs[i] % r)); // checkerboarding must accommodate choice of blocksize
+      _bs_cb[i] = _bs[i] / r;
+      _block_sites *= _bs_cb[i];
+      _nb[i] = _l[i] / _bs[i];
+      _nb_o[i] = _nb[i] / _grid->_simd_layout[i];
+      if (_nb[i] % _grid->_simd_layout[i]) { // simd must accommodate choice of blocksize
+	std::cout << GridLogMessage << "Problem: _nb[" << i << "] = " << _nb[i] << " _grid->_simd_layout[" << i << "] = " << _grid->_simd_layout[i] << std::endl;
+	assert(0);
+      }
+      _blocks *= _nb[i];
+      _o_blocks *= _nb_o[i];
+      _cf_size *= _l[i];
+    }
+
+    _cf_size *= 12 / 2;
+    _cf_block_size = _cf_size / _blocks;
+    _cf_o_block_size = _cf_size / _o_blocks;
+
+    std::cout << GridLogMessage << "BlockedGrid:" << std::endl;
+    std::cout << GridLogMessage << " _l     = " << _l << std::endl;
+    std::cout << GridLogMessage << " _l_cb     = " << _l_cb << std::endl;
+    std::cout << GridLogMessage << " _l_cb_o     = " << _l_cb_o << std::endl;
+    std::cout << GridLogMessage << " _bs    = " << _bs << std::endl;
+    std::cout << GridLogMessage << " _bs_cb    = " << _bs_cb << std::endl;
+
+    std::cout << GridLogMessage << " _nb    = " << _nb << std::endl;
+    std::cout << GridLogMessage << " _nb_o    = " << _nb_o << std::endl;
+    std::cout << GridLogMessage << " _blocks = " << _blocks << std::endl;
+    std::cout << GridLogMessage << " _o_blocks = " << _o_blocks << std::endl;
+    std::cout << GridLogMessage << " sizeof(vCoeff_t) = " << sizeof(vCoeff_t) << std::endl;
+    std::cout << GridLogMessage << " _cf_size = " << _cf_size << std::endl;
+    std::cout << GridLogMessage << " _cf_block_size = " << _cf_block_size << std::endl;
+    std::cout << GridLogMessage << " _block_sites = " << _block_sites << std::endl;
+    std::cout << GridLogMessage << " _grid->oSites() = " << _grid->oSites() << std::endl;
+
+    //    _grid->Barrier();
+    //abort();
+  }
+
+    void block_to_coor(int b, std::vector<int>& x0) {
+
+      std::vector<int> bcoor;
+      bcoor.resize(_nd);
+      x0.resize(_nd);
+      assert(b < _o_blocks);
+      Lexicographic::CoorFromIndex(bcoor,b,_nb_o);
+      int i;
+
+      for (i=0;i<_nd;i++) {
+	x0[i] = bcoor[i]*_bs_cb[i];
+      }
+
+      //std::cout << GridLogMessage << "Map block b -> " << x0 << std::endl;
+
+    }
+
+    void block_site_to_o_coor(const std::vector<int>& x0, std::vector<int>& coor, int i) {
+      Lexicographic::CoorFromIndex(coor,i,_bs_cb);
+      for (int j=0;j<_nd;j++)
+	coor[j] += x0[j];
+    }
+
+    int block_site_to_o_site(const std::vector<int>& x0, int i) {
+      std::vector<int> coor;  coor.resize(_nd);
+      block_site_to_o_coor(x0,coor,i);
+      Lexicographic::IndexFromCoor(coor,i,_l_cb_o);
+      return i;
+    }
+
+    vCoeff_t block_sp(int b, const Field& x, const Field& y) {
+
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+
+      vCoeff_t ret = 0.0;
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+	ret += TensorRemove(innerProduct(x._odata[ss],y._odata[ss]));
+      }
+
+      return ret;
+
+    }
+
+    vCoeff_t block_sp(int b, const Field& x, const std::vector< ComplexD >& y) {
+
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+
+      constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t);
+      int lsize = _cf_o_block_size / _block_sites;
+
+      std::vector< ComplexD > ret(nsimd);
+      for (int i=0;i<nsimd;i++)
+	ret[i] = 0.0;
+
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+
+	int n = lsize / nsimd;
+	for (int l=0;l<n;l++) {
+	  for (int j=0;j<nsimd;j++) {
+	    int t = lsize * i + l*nsimd + j;
+
+	    ret[j] += conjugate(((Coeff_t*)&x._odata[ss]._internal)[l*nsimd + j]) * y[t];
+	  }
+	}
+      }
+
+      vCoeff_t vret;
+      for (int i=0;i<nsimd;i++)
+	((Coeff_t*)&vret)[i] = (Coeff_t)ret[i];
+
+      return vret;
+
+    }
+
+    template<class T>
+      void vcaxpy(iScalar<T>& r,const vCoeff_t& a,const iScalar<T>& x,const iScalar<T>& y) {
+      vcaxpy(r._internal,a,x._internal,y._internal);
+    }
+
+    template<class T,int N>
+      void vcaxpy(iVector<T,N>& r,const vCoeff_t& a,const iVector<T,N>& x,const iVector<T,N>& y) {
+      for (int i=0;i<N;i++)
+	vcaxpy(r._internal[i],a,x._internal[i],y._internal[i]);
+    }
+
+    void vcaxpy(vCoeff_t& r,const vCoeff_t& a,const vCoeff_t& x,const vCoeff_t& y) {
+      r = a*x + y;
+    }
+
+    void block_caxpy(int b, Field& ret, const vCoeff_t& a, const Field& x, const Field& y) {
+
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+	vcaxpy(ret._odata[ss],a,x._odata[ss],y._odata[ss]);
+      }
+
+    }
+
+    void block_caxpy(int b, std::vector< ComplexD >& ret, const vCoeff_t& a, const Field& x, const std::vector< ComplexD >& y) {
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+
+      constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t);
+      int lsize = _cf_o_block_size / _block_sites;
+
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+
+	int n = lsize / nsimd;
+	for (int l=0;l<n;l++) {
+	  vCoeff_t r = a* ((vCoeff_t*)&x._odata[ss]._internal)[l];
+
+	  for (int j=0;j<nsimd;j++) {
+	    int t = lsize * i + l*nsimd + j;
+	    ret[t] = y[t] + ((Coeff_t*)&r)[j];
+	  }
+	}
+      }
+
+    }
+
+    void block_set(int b, Field& ret, const std::vector< ComplexD >& x) {
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+
+      int lsize = _cf_o_block_size / _block_sites;
+
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+
+	for (int l=0;l<lsize;l++)
+	  ((Coeff_t*)&ret._odata[ss]._internal)[l] = (Coeff_t)x[lsize * i + l]; // convert precision
+      }
+
+    }
+
+    void block_get(int b, const Field& ret, std::vector< ComplexD >& x) {
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+
+      int lsize = _cf_o_block_size / _block_sites;
+
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+
+	for (int l=0;l<lsize;l++)
+	  x[lsize * i + l] = (ComplexD)((Coeff_t*)&ret._odata[ss]._internal)[l];
+      }
+
+    }
+
+    template<class T>
+    void vcscale(iScalar<T>& r,const vCoeff_t& a,const iScalar<T>& x) {
+      vcscale(r._internal,a,x._internal);
+    }
+
+    template<class T,int N>
+    void vcscale(iVector<T,N>& r,const vCoeff_t& a,const iVector<T,N>& x) {
+      for (int i=0;i<N;i++)
+	vcscale(r._internal[i],a,x._internal[i]);
+    }
+
+    void vcscale(vCoeff_t& r,const vCoeff_t& a,const vCoeff_t& x) {
+      r = a*x;
+    }
+
+    void block_cscale(int b, const vCoeff_t& a, Field& ret) {
+
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+      
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+	vcscale(ret._odata[ss],a,ret._odata[ss]);
+      }
+    }
+
+    void getCanonicalBlockOffset(int cb, std::vector<int>& x0) {
+      const int ndim = 5;
+      assert(_nb.size() == ndim);
+      std::vector<int> _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] };
+      std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
+      x0.resize(ndim);
+
+      assert(cb >= 0);
+      assert(cb < _nbc[0]*_nbc[1]*_nbc[2]*_nbc[3]*_nbc[4]);
+
+      Lexicographic::CoorFromIndex(x0,cb,_nbc);
+      int i;
+
+      for (i=0;i<ndim;i++) {
+	x0[i] *= _bsc[i];
+      }
+
+      //if (cb < 2)
+      //	std::cout << GridLogMessage << "Map: " << cb << " To: " << x0 << std::endl;
+    }
+
+    void pokeBlockOfVectorCanonical(int cb,Field& v,const std::vector<float>& buf) {
+      std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
+      std::vector<int> ldim = v._grid->LocalDimensions();
+      std::vector<int> cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] };
+      const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4];
+      // take canonical block cb of v and put it in canonical ordering in buf
+      std::vector<int> cx0;
+      getCanonicalBlockOffset(cb,cx0);
+
+#pragma omp parallel
+      {
+	std::vector<int> co0,cl0;
+	co0=cx0; cl0=cx0;
+
+#pragma omp for
+	for (int i=0;i<_nbsc;i++) {
+	  Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo
+	  for (int j=0;j<(int)_bsc.size();j++)
+	    cl0[j] = cx0[j] + co0[j];
+	  
+	  std::vector<int> l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] };
+	  int oi = v._grid->oIndex(l0);
+	  int ii = v._grid->iIndex(l0);
+	  int lti = i;
+
+	  //if (cb < 2 && i<2)
+	  //  std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl;
+	  
+	  for (int s=0;s<4;s++)
+	    for (int c=0;c<3;c++) {
+	      Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii];
+	      int ti = 12*lti + 3*s + c;
+	      ld = Coeff_t(buf[2*ti+0], buf[2*ti+1]);
+	    }
+	}
+      }
+    }
+
+    void peekBlockOfVectorCanonical(int cb,const Field& v,std::vector<float>& buf) {
+      std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
+      std::vector<int> ldim = v._grid->LocalDimensions();
+      std::vector<int> cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] };
+      const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4];
+      // take canonical block cb of v and put it in canonical ordering in buf
+      std::vector<int> cx0;
+      getCanonicalBlockOffset(cb,cx0);
+
+      buf.resize(_cf_block_size * 2);
+
+#pragma omp parallel
+      {
+	std::vector<int> co0,cl0;
+	co0=cx0; cl0=cx0;
+
+#pragma omp for
+	for (int i=0;i<_nbsc;i++) {
+	  Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo
+	  for (int j=0;j<(int)_bsc.size();j++)
+	    cl0[j] = cx0[j] + co0[j];
+	  
+	  std::vector<int> l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] };
+	  int oi = v._grid->oIndex(l0);
+	  int ii = v._grid->iIndex(l0);
+	  int lti = i;
+	  
+	  //if (cb < 2 && i<2)
+	  //  std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl;
+
+	  for (int s=0;s<4;s++)
+	    for (int c=0;c<3;c++) {
+	      Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii];
+	      int ti = 12*lti + 3*s + c;
+	      buf[2*ti+0] = ld.real();
+	      buf[2*ti+1] = ld.imag();
+	    }
+	}
+      }
+    }
+
+    int globalToLocalCanonicalBlock(int slot,const std::vector<int>& src_nodes,int nb) {
+      // processor coordinate
+      int _nd = (int)src_nodes.size();
+      std::vector<int> _src_nodes = src_nodes;
+      std::vector<int> pco(_nd);
+      Lexicographic::CoorFromIndex(pco,slot,_src_nodes);
+      std::vector<int> cpco = { pco[1], pco[2], pco[3], pco[4], pco[0] };
+
+      // get local block
+      std::vector<int> _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] };
+      assert(_nd == 5);
+      std::vector<int> c_src_local_blocks(_nd);
+      for (int i=0;i<_nd;i++) {
+	assert(_grid->_fdimensions[i] % (src_nodes[i] * _bs[i]) == 0);
+	c_src_local_blocks[(i+4) % 5] = _grid->_fdimensions[i] / src_nodes[i] / _bs[i];
+      }
+      std::vector<int> cbcoor(_nd); // coordinate of block in slot in canonical form
+      Lexicographic::CoorFromIndex(cbcoor,nb,c_src_local_blocks);
+
+      // cpco, cbcoor
+      std::vector<int> clbcoor(_nd);
+      for (int i=0;i<_nd;i++) {
+	int cgcoor = cpco[i] * c_src_local_blocks[i] + cbcoor[i]; // global block coordinate
+	int pcoor = cgcoor / _nbc[i]; // processor coordinate in my Grid
+	int tpcoor = _grid->_processor_coor[(i+1)%5];
+	if (pcoor != tpcoor)
+	  return -1;
+	clbcoor[i] = cgcoor - tpcoor * _nbc[i]; // canonical local block coordinate for canonical dimension i
+      }
+
+      int lnb;
+      Lexicographic::IndexFromCoor(clbcoor,lnb,_nbc);
+      //std::cout << "Mapped slot = " << slot << " nb = " << nb << " to " << lnb << std::endl;
+      return lnb;
+    }
+
+
+ };
+
+}
@@ -331,7 +331,7 @@ void CoarseGridLanczos(BlockProjector<Field>& pr,RealD alpha2,RealD beta,int Npo
      ) {
    

-    IRL2.calc(eval2,coef._v,src_coarse,Nconv,true,SkipTest2);
+    IRL2.calc(eval2,coef._v,src_coarse,Nconv,true);

    coef.resize(Nstop2);
    eval2.resize(Nstop2);
@@ -635,7 +635,7 @@ int main (int argc, char ** argv) {
    if (simple_krylov_basis) {
      quick_krylov_basis(evec,src,Op1,Nstop1);
    } else {
-      IRL1.calc(eval1,evec._v,src,Nconv,false,1);
+      IRL1.calc(eval1,evec._v,src,Nconv,false);
    }
    evec.resize(Nstop1); // and throw away superfluous
    eval1.resize(Nstop1);
@@ -56,6 +56,7 @@ struct CompressedLanczosParams : Serializable {
 				  LanczosParams, FineParams,
 				  LanczosParams, CoarseParams,
 				  ChebyParams,   Smoother,
+				  RealD        , coarse_relax_tol,
 				  std::vector<int>, blockSize,
 				  std::string, config,
 				  std::vector < std::complex<double>  >, omega,
@@ -137,12 +138,13 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
  OperatorFunction<FineField>   & _smoother;
  LinearOperatorBase<FineField> &_Linop;
  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
-
+  RealD                             _coarse_relax_tol;
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
-					   Aggregation<Fobj,CComplex,nbasis> &Aggregate) 
-    : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly)  {    };
+					   Aggregation<Fobj,CComplex,nbasis> &Aggregate,
+					   RealD coarse_relax_tol=5.0e3) 
+    : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol)  {    };

  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
@@ -196,7 +198,7 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
-
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
@@ -337,7 +339,7 @@ public:
    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;

    int Nconv;
-    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false,0);
+    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
    
    // Shrink down to number saved
    assert(Nstop>=nbasis);
@@ -345,7 +347,7 @@ public:
    evals_fine.resize(nbasis);
    _Aggregate.subspace.resize(nbasis,_FineGrid);
  }
-  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,
+  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
@@ -357,8 +359,7 @@ public:
    //////////////////////////////////////////////////////////////////////////////////////////////////

    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate);
-
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);

    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
@@ -367,7 +368,7 @@ public:

    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
-    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false,1);
+    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
    assert(Nconv>=Nstop);

    for (int i=0;i<Nstop;i++){
@@ -492,7 +493,7 @@ int main (int argc, char ** argv) {
  IRL.Orthogonalise();

  std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
-  IRL.calcCoarse(coarse.Cheby,Params.Smoother,
+  IRL.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
 		 coarse.Nstop, coarse.Nk,coarse.Nm,
 		 coarse.resid, coarse.MaxIt, 
 		 coarse.betastp,coarse.MinRes);