Merge branch 'develop' into feature/hadrons-new-memory-model

2026-06-30 23:43:29 +01:00 · 2017-12-01 19:45:12 +00:00
parent 514993ed17 2fd4989029
commit 456c78c233
49 changed files with 2292 additions and 1625 deletions
@@ -37,8 +37,15 @@ RealD InverseApproximation(RealD x){
 RealD SqrtApproximation(RealD x){
  return std::sqrt(x);
 }
+// Target function x^(-1/32) handed to the Chebyshev approximation below.
+RealD Approximation32(RealD x){
+  const RealD exponent = -1.0/32.0;
+  return std::pow(x,exponent);
+}
+// Target function x^(-1/2) handed to the Chebyshev approximation below.
+RealD Approximation2(RealD x){
+  const RealD exponent = -1.0/2.0;
+  return std::pow(x,exponent);
+}
+
 RealD StepFunction(RealD x){
-  if ( x<0.1 )  return 1.0;
+  if ( x<10.0 )  return 1.0;
  else return 0.0;
 }

@@ -56,7 +63,6 @@ int main (int argc, char ** argv)

  Chebyshev<LatticeFermion> ChebyInv(lo,hi,2000,InverseApproximation);

-
  {
    std::ofstream of("chebyinv");
    ChebyInv.csv(of);
@@ -78,7 +84,6 @@ int main (int argc, char ** argv)


  ChebyStep.JacksonSmooth();
-
  {
    std::ofstream of("chebystepjack");
    ChebyStep.csv(of);
@@ -100,5 +105,30 @@ int main (int argc, char ** argv)
    ChebyNE.csv(of);
  }

+  lo=0.0;
+  hi=4.0;
+  Chebyshev<LatticeFermion> Cheby32(lo,hi,2000,Approximation32);
+  {
+    std::ofstream of("cheby32");
+    Cheby32.csv(of);
+  }
+  Cheby32.JacksonSmooth();
+  {
+    std::ofstream of("cheby32jack");
+    Cheby32.csv(of);
+  }
+
+  Chebyshev<LatticeFermion> ChebySqrt(lo,hi,2000,Approximation2);
+  {
+    std::ofstream of("chebysqrt");
+    ChebySqrt.csv(of);
+  }
+  ChebySqrt.JacksonSmooth();
+  {
+    std::ofstream of("chebysqrtjack");
+    ChebySqrt.csv(of);
+  }
+
+
  Grid_finalize();
 }
@@ -38,11 +38,11 @@ int main (int argc, char ** argv)

  std::cout<<GridLogMessage << "Testing Remez"<<std::endl;

-  double     lo=0.01;
-  double     hi=1.0;
+  double     lo=1.0e-3;
+  double     hi=5.0;
  int precision=64;
-  int    degree=10;
-  AlgRemez remez(0.001,1.0,precision);
+  int    degree=16;
+  AlgRemez remez(lo,hi,precision);

  ////////////////////////////////////////
  // sqrt and inverse sqrt
@@ -50,21 +50,50 @@ int main (int argc, char ** argv)

  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
  remez.generateApprox(degree,1,2);
-  MultiShiftFunction Sqrt(remez,1.0,false);
-  MultiShiftFunction InvSqrt(remez,1.0,true);
+  MultiShiftFunction Root2(remez,1.0,false);
+  MultiShiftFunction InvRoot2(remez,1.0,true);


  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/4)"<<std::endl;
  remez.generateApprox(degree,1,4);
-  MultiShiftFunction SqrtSqrt(remez,1.0,false);
-  MultiShiftFunction InvSqrtSqrt(remez,1.0,true);
+  MultiShiftFunction Root4(remez,1.0,false);
+  MultiShiftFunction InvRoot4(remez,1.0,true);

+  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/8)"<<std::endl;
+  remez.generateApprox(degree,1,8);
+  MultiShiftFunction Root8(remez,1.0,false);
+  MultiShiftFunction InvRoot8(remez,1.0,true);
+
+  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/16)"<<std::endl;
+  remez.generateApprox(degree,1,16);
+  MultiShiftFunction Root16(remez,1.0,false);
+  MultiShiftFunction InvRoot16(remez,1.0,true);
+
+  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/32)"<<std::endl;
+  remez.generateApprox(degree,1,32);
+  MultiShiftFunction Root32(remez,1.0,false);
+  MultiShiftFunction InvRoot32(remez,1.0,true);
  
-  ofstream gnuplot(std::string("Sqrt.gnu"),std::ios::out|std::ios::trunc);
-  Sqrt.gnuplot(gnuplot);
+  ofstream gnuplot(std::string("Root2.gnu"),std::ios::out|std::ios::trunc);
+  Root2.gnuplot(gnuplot);
+
+  ofstream gnuplot_i2(std::string("InvRoot2.gnu"),std::ios::out|std::ios::trunc);
+  InvRoot2.gnuplot(gnuplot_i2);
+
+  ofstream gnuplot_i4(std::string("InvRoot4.gnu"),std::ios::out|std::ios::trunc);
+  InvRoot4.gnuplot(gnuplot_i4);
+
+  ofstream gnuplot_i8(std::string("InvRoot8.gnu"),std::ios::out|std::ios::trunc);
+  InvRoot8.gnuplot(gnuplot_i8);
+
+  ofstream gnuplot_i16(std::string("InvRoot16.gnu"),std::ios::out|std::ios::trunc);
+  InvRoot16.gnuplot(gnuplot_i16);
+
+  ofstream gnuplot_i32(std::string("InvRoot32.gnu"),std::ios::out|std::ios::trunc);
+  InvRoot32.gnuplot(gnuplot_i32);
+
+

-  ofstream gnuplot_inv(std::string("InvSqrt.gnu"),std::ios::out|std::ios::trunc);
-  InvSqrt.gnuplot(gnuplot);

  double x=0.6789;
  double sx=std::sqrt(x);
@@ -72,10 +101,10 @@ int main (int argc, char ** argv)
  double isx=1.0/sx;
  double issx=1.0/ssx;

-  double asx  =Sqrt.approx(x);
-  double assx =SqrtSqrt.approx(x);
-  double aisx =InvSqrt.approx(x);
-  double aissx=InvSqrtSqrt.approx(x);
+  double asx  =Root2.approx(x);
+  double assx =Root4.approx(x);
+  double aisx =InvRoot2.approx(x);
+  double aissx=InvRoot4.approx(x);

  std::cout<<GridLogMessage << "x^(1/2) : "<<sx<<" "<<asx<<std::endl;
  std::cout<<GridLogMessage << "x^(1/4) : "<<ssx<<" "<<assx<<std::endl;
@@ -0,0 +1,143 @@
+namespace Grid { 
+
+/*
+  BlockProjector
+
+  If _HP_BLOCK_PROJECTORS_ is defined, we assume that _evec is a basis that is not
+  fully orthonormalized (to the precision of the coarse field) and we allow for higher-precision
+  coarse field than basis field.
+
+*/
+//#define _HP_BLOCK_PROJECTORS_
+
+template<typename Field>
+class BlockProjector {
+public:
+
+  BasisFieldVector<Field>& _evec;
+  BlockedGrid<Field>& _bgrid;
+
+  // Stores references (no copies) to the basis vectors and to the blocked-grid helper.
+  BlockProjector(BasisFieldVector<Field>& evec, BlockedGrid<Field>& bgrid)
+    : _evec(evec), _bgrid(bgrid)
+  {}
+
+  // Block-wise Gram-Schmidt over the basis vectors in _evec: inside every block each vector has
+  // its components along the earlier vectors removed and is then rescaled by 1/sqrt(norm).
+  // thres is only used for reporting: the log line gives the fraction of (block,vector) pairs
+  // whose norm ratio after/before the subtraction step fell below it.
+  void createOrthonormalBasis(RealD thres = 0.0) {
+
+    GridStopWatch timer;
+    timer.Start();
+
+    int cnt = 0;
+
+    // Blocks are independent of each other; the per-thread counts are summed by the reduction.
+#pragma omp parallel for reduction(+:cnt)
+    for (int b=0;b<_bgrid._o_blocks;b++) {
+
+      for (int i=0;i<_evec._Nm;i++) {
+
+        Field& vi = _evec._v[i];
+
+        auto nrm_before = _bgrid.block_sp(b,vi,vi);
+
+        // |i> -= <j|i> |j>
+        for (int j=0;j<i;j++) {
+          Field& vj = _evec._v[j];
+          _bgrid.block_caxpy(b,vi,-_bgrid.block_sp(b,vj,vi),vj,vi);
+        }
+
+        auto nrm_after = _bgrid.block_sp(b,vi,vi);
+
+        auto ratio = nrm_after/nrm_before;
+        if (Reduce(ratio).real() < thres) {
+          cnt++;
+        }
+
+        // TODO: if norm is too small, remove this eigenvector/mark as not needed; in practice: set it to zero norm here and return a mask
+        // that is then used later to decide not to write certain eigenvectors to disk (add a norm calculation before subtraction step and look at nrm/nrm0 < eps to decide)
+        _bgrid.block_cscale(b,1.0 / sqrt(nrm_after),vi);
+
+      }
+
+    }
+
+    timer.Stop();
+    std::cout << GridLogMessage << "Gram-Schmidt to create blocked basis took " << timer.Elapsed() << " (" << ((RealD)cnt / (RealD)_bgrid._o_blocks / (RealD)_evec._Nm) 
+	      << " below threshold)" << std::endl;
+
+  }
+
+  // Expands a coarse field into a fine field: inside every block, out is the sum over j of
+  // (coarse coefficient j of that block) * (basis vector j). out is zeroed first and takes its
+  // checkerboard from the first basis vector.
+  template<typename CoarseField>
+  void coarseToFine(const CoarseField& in, Field& out) {
+
+    out = zero;
+    out.checkerboard = _evec._v[0].checkerboard;
+
+    // The number of coefficients per coarse site must equal the number of basis vectors.
+    int Nbasis = sizeof(in._odata[0]._internal._internal) / sizeof(in._odata[0]._internal._internal[0]);
+    assert(Nbasis == _evec._Nm);
+
+    const int nblocks = _bgrid._o_blocks;
+    const int nvec    = _evec._Nm;
+
+#pragma omp parallel for
+    for (int blk=0;blk<nblocks;blk++) {
+      for (int k=0;k<nvec;k++) {
+        _bgrid.block_caxpy(blk,out,in._odata[blk]._internal._internal[k],_evec._v[k],out);
+      }
+    }
+
+  }
+
+  // Projects a fine field onto the block basis. For every block and every basis vector j the
+  // block inner product <j|rhs> is stored as coarse coefficient j, and that component is removed
+  // from a working copy of the input before the next vector is processed.
+  template<typename CoarseField>
+  void fineToCoarse(const Field& in, CoarseField& out) {
+
+    out = zero;
+
+    // The number of coefficients per coarse site must equal the number of basis vectors.
+    int Nbasis = sizeof(out._odata[0]._internal._internal) / sizeof(out._odata[0]._internal._internal[0]);
+    assert(Nbasis == _evec._Nm);
+
+    // Work on a copy so that the caller's field is left untouched.
+    Field residual(_bgrid._grid);
+    residual = in;
+
+    const int nblocks = _bgrid._o_blocks;
+    const int nvec    = _evec._Nm;
+
+#pragma omp parallel for
+    for (int blk=0;blk<nblocks;blk++) {
+      for (int k=0;k<nvec;k++) {
+        // |rhs> -= <k|rhs> |k>
+        auto coeff = _bgrid.block_sp(blk,_evec._v[k],residual);
+        _bgrid.block_caxpy(blk,residual,-coeff,_evec._v[k],residual); // may make this more numerically stable
+        out._odata[blk]._internal._internal[k] = coeff;
+      }
+    }
+
+  }
+
+  // Deflation carried out on the fine grid:
+  //   result = sum_{i<N} ( <v_i|src_orig> / eval[i] ) v_i ,  v_i = coarseToFine(_coef._v[i]).
+  // _coef holds the coarse-grid coefficient vectors, eval the matching eigenvalues.
+  template<typename CoarseField>
+    void deflateFine(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+    result = zero;
+    // One scratch field for the whole loop: coarseToFine() zeroes its output and resets the
+    // checkerboard on every call, so there is no need to allocate a fresh lattice per iteration.
+    Field tmp(result._grid);
+    for (int i=0;i<N;i++) {
+      coarseToFine(_coef._v[i],tmp);
+      axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
+    }
+  }
+
+  // Deflation carried out on the coarse grid: the source is projected with fineToCoarse, the sum
+  //   sum_{k<N} ( <c_k|src_coarse> / eval[k] ) c_k
+  // is accumulated in the coarse space, and the result is expanded once with coarseToFine.
+  template<typename CoarseField>
+    void deflateCoarse(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+    CoarseField src_coarse(_coef._v[0]._grid);
+    CoarseField result_coarse = src_coarse;
+
+    fineToCoarse(src_orig,src_coarse);
+
+    result_coarse = zero;
+    for (int k=0;k<N;k++) {
+      const CoarseField& mode = _coef._v[k];
+      axpy(result_coarse,TensorRemove(innerProduct(mode,src_coarse)) / eval[k],mode,result_coarse);
+    }
+
+    coarseToFine(result_coarse,result);
+  }
+
+  template<typename CoarseField>
+    void deflate(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+    // Default entry point: forwards unchanged to deflateCoarse (coarse-grid deflation is much faster); deflateFine is kept only for legacy reasons for now.
+    deflateCoarse(_coef,eval,N,src_orig,result);
+  }
+
+};
+}
@@ -0,0 +1,401 @@
+namespace Grid {
+
+template<typename Field>
+class BlockedGrid {
+public:
+  GridBase* _grid;
+  typedef typename Field::scalar_type  Coeff_t;
+  typedef typename Field::vector_type vCoeff_t;
+  
+  std::vector<int> _bs; // block size
+  std::vector<int> _nb; // number of blocks
+  std::vector<int> _l;  // local dimensions irrespective of cb
+  std::vector<int> _l_cb;  // local dimensions of checkerboarded vector
+  std::vector<int> _l_cb_o;  // local dimensions of inner checkerboarded vector
+  std::vector<int> _bs_cb; // block size in checkerboarded vector
+  std::vector<int> _nb_o; // number of blocks of simd o-sites
+
+  int _nd, _blocks, _cf_size, _cf_block_size, _cf_o_block_size, _o_blocks, _block_sites;
+  
+  /*
+    Derives the block decomposition of the local volume of `grid` for the requested block size.
+
+    The mem-initializer list is written in member-declaration order, which is the order in which
+    C++ initialises members regardless of how the list is spelled. In particular _nd is
+    initialised last, after _bs, which it reads. The vectors are first sized from block_size and
+    then overwritten below.
+
+    The constructor asserts that
+      - the local extent in every dimension is a multiple of the block size,
+      - the block size is a multiple of the ratio between the local extent and grid->LocalDimensions(),
+      - the number of blocks in every dimension is a multiple of the SIMD layout in that dimension.
+  */
+  BlockedGrid(GridBase* grid, const std::vector<int>& block_size) :
+    _grid(grid), _bs(block_size), _nb(block_size), _l(block_size), _l_cb(block_size),
+      _l_cb_o(block_size), _bs_cb(block_size), _nb_o(block_size), _nd((int)_bs.size()) {
+
+    _blocks = 1;
+    _o_blocks = 1;
+    _l = grid->FullDimensions();
+    _l_cb = grid->LocalDimensions();
+    _l_cb_o = grid->_rdimensions;
+
+    _cf_size = 1;
+    _block_sites = 1;
+    for (int i=0;i<_nd;i++) {
+      // full extent -> extent per processor
+      _l[i] /= grid->_processors[i];
+
+      assert(!(_l[i] % _bs[i])); // lattice must accommodate choice of blocksize
+
+      // ratio between the local extent and the extent reported by LocalDimensions()
+      int r = _l[i] / _l_cb[i];
+      assert(!(_bs[i] % r)); // checkerboarding must accommodate choice of blocksize
+      _bs_cb[i] = _bs[i] / r;
+      _block_sites *= _bs_cb[i];
+      _nb[i] = _l[i] / _bs[i];
+      _nb_o[i] = _nb[i] / _grid->_simd_layout[i];
+      if (_nb[i] % _grid->_simd_layout[i]) { // simd must accommodate choice of blocksize
+        std::cout << GridLogMessage << "Problem: _nb[" << i << "] = " << _nb[i] << " _grid->_simd_layout[" << i << "] = " << _grid->_simd_layout[i] << std::endl;
+        assert(0);
+      }
+      _blocks *= _nb[i];
+      _o_blocks *= _nb_o[i];
+      _cf_size *= _l[i];
+    }
+
+    // NOTE(review): factor 6 = 12/2; presumably 12 components per site on half of the sites - confirm.
+    _cf_size *= 12 / 2;
+    _cf_block_size = _cf_size / _blocks;
+    _cf_o_block_size = _cf_size / _o_blocks;
+
+    std::cout << GridLogMessage << "BlockedGrid:" << std::endl;
+    std::cout << GridLogMessage << " _l     = " << _l << std::endl;
+    std::cout << GridLogMessage << " _l_cb     = " << _l_cb << std::endl;
+    std::cout << GridLogMessage << " _l_cb_o     = " << _l_cb_o << std::endl;
+    std::cout << GridLogMessage << " _bs    = " << _bs << std::endl;
+    std::cout << GridLogMessage << " _bs_cb    = " << _bs_cb << std::endl;
+
+    std::cout << GridLogMessage << " _nb    = " << _nb << std::endl;
+    std::cout << GridLogMessage << " _nb_o    = " << _nb_o << std::endl;
+    std::cout << GridLogMessage << " _blocks = " << _blocks << std::endl;
+    std::cout << GridLogMessage << " _o_blocks = " << _o_blocks << std::endl;
+    std::cout << GridLogMessage << " sizeof(vCoeff_t) = " << sizeof(vCoeff_t) << std::endl;
+    std::cout << GridLogMessage << " _cf_size = " << _cf_size << std::endl;
+    std::cout << GridLogMessage << " _cf_block_size = " << _cf_block_size << std::endl;
+    std::cout << GridLogMessage << " _block_sites = " << _block_sites << std::endl;
+    std::cout << GridLogMessage << " _grid->oSites() = " << _grid->oSites() << std::endl;
+
+    //    _grid->Barrier();
+    //abort();
+  }
+
+    // Origin of block b: the block coordinate (b decomposed with respect to _nb_o) scaled by the
+    // block extent _bs_cb in every dimension. The result is written to x0.
+    void block_to_coor(int b, std::vector<int>& x0) {
+
+      assert(b < _o_blocks);
+
+      std::vector<int> block_coor(_nd);
+      Lexicographic::CoorFromIndex(block_coor,b,_nb_o);
+
+      x0.resize(_nd);
+      for (int d=0;d<_nd;d++) {
+        x0[d] = block_coor[d]*_bs_cb[d];
+      }
+
+      //std::cout << GridLogMessage << "Map block b -> " << x0 << std::endl;
+
+    }
+
+    // coor = x0 + (in-block site index i decomposed with respect to the block extent _bs_cb).
+    void block_site_to_o_coor(const std::vector<int>& x0, std::vector<int>& coor, int i) {
+      Lexicographic::CoorFromIndex(coor,i,_bs_cb);
+      for (int d=0;d<_nd;d++) {
+        coor[d] = coor[d] + x0[d];
+      }
+    }
+
+    // Lexicographic index (with respect to _l_cb_o) of in-block site i of the block whose origin is x0.
+    int block_site_to_o_site(const std::vector<int>& x0, int i) {
+      std::vector<int> site_coor(_nd);
+      block_site_to_o_coor(x0,site_coor,i);
+
+      int osite = i;
+      Lexicographic::IndexFromCoor(site_coor,osite,_l_cb_o);
+      return osite;
+    }
+
+    // Inner product of x and y restricted to block b: the site-wise innerProduct is summed over
+    // the _block_sites sites of the block.
+    vCoeff_t block_sp(int b, const Field& x, const Field& y) {
+
+      std::vector<int> origin;
+      block_to_coor(b,origin);
+
+      vCoeff_t sum = 0.0;
+      for (int k=0;k<_block_sites;k++) { // only odd sites
+        const int osite = block_site_to_o_site(origin,k);
+        sum += TensorRemove(innerProduct(x._odata[osite],y._odata[osite]));
+      }
+
+      return sum;
+
+    }
+
+    vCoeff_t block_sp(int b, const Field& x, const std::vector< ComplexD >& y) { // block-b inner product of field x with the flat buffer y; one result per scalar slot of vCoeff_t
+
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+
+      constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t); // number of Coeff_t scalars packed in one vCoeff_t
+      int lsize = _cf_o_block_size / _block_sites; // number of y entries consumed per block site
+
+      std::vector< ComplexD > ret(nsimd); // accumulate in ComplexD, whatever Coeff_t is
+      for (int i=0;i<nsimd;i++)
+	ret[i] = 0.0;
+
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+
+	int n = lsize / nsimd;
+	for (int l=0;l<n;l++) {
+	  for (int j=0;j<nsimd;j++) {
+	    int t = lsize * i + l*nsimd + j; // y is indexed site-major: lsize consecutive entries per block site
+
+	    ret[j] += conjugate(((Coeff_t*)&x._odata[ss]._internal)[l*nsimd + j]) * y[t]; // site data of x viewed as a flat array of Coeff_t
+	  }
+	}
+      }
+
+      vCoeff_t vret;
+      for (int i=0;i<nsimd;i++)
+	((Coeff_t*)&vret)[i] = (Coeff_t)ret[i]; // convert each accumulator back to Coeff_t and store it in slot i
+
+      return vret;
+
+    }
+
+    // vcaxpy: r = a*x + y with a coefficient of type vCoeff_t. The iScalar and iVector overloads
+    // only unwrap one tensor level (element by element for iVector); the vCoeff_t overload does
+    // the arithmetic.
+    template<class T>
+      void vcaxpy(iScalar<T>& r,const vCoeff_t& a,const iScalar<T>& x,const iScalar<T>& y) {
+      vcaxpy(r._internal,a,x._internal,y._internal);
+    }
+
+    template<class T,int N>
+      void vcaxpy(iVector<T,N>& r,const vCoeff_t& a,const iVector<T,N>& x,const iVector<T,N>& y) {
+      for (int k=0;k<N;k++) {
+        vcaxpy(r._internal[k],a,x._internal[k],y._internal[k]);
+      }
+    }
+
+    void vcaxpy(vCoeff_t& r,const vCoeff_t& a,const vCoeff_t& x,const vCoeff_t& y) {
+      r = a*x + y;
+    }
+
+    // ret = a*x + y on the sites of block b; sites outside the block are not touched.
+    void block_caxpy(int b, Field& ret, const vCoeff_t& a, const Field& x, const Field& y) {
+
+      std::vector<int> origin;
+      block_to_coor(b,origin);
+
+      for (int k=0;k<_block_sites;k++) { // only odd sites
+        const int osite = block_site_to_o_site(origin,k);
+        vcaxpy(ret._odata[osite],a,x._odata[osite],y._odata[osite]);
+      }
+
+    }
+
+    void block_caxpy(int b, std::vector< ComplexD >& ret, const vCoeff_t& a, const Field& x, const std::vector< ComplexD >& y) { // ret = a*x + y on block b, with ret and y given as flat ComplexD buffers
+      std::vector<int> x0;
+      block_to_coor(b,x0);
+
+      constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t); // number of Coeff_t scalars packed in one vCoeff_t
+      int lsize = _cf_o_block_size / _block_sites; // number of buffer entries per block site
+
+      for (int i=0;i<_block_sites;i++) { // only odd sites
+	int ss = block_site_to_o_site(x0,i);
+
+	int n = lsize / nsimd;
+	for (int l=0;l<n;l++) {
+	  vCoeff_t r = a* ((vCoeff_t*)&x._odata[ss]._internal)[l]; // site data of x viewed as an array of vCoeff_t; multiply one vector word at a time
+
+	  for (int j=0;j<nsimd;j++) {
+	    int t = lsize * i + l*nsimd + j; // buffers are indexed site-major: lsize consecutive entries per block site
+	    ret[t] = y[t] + ((Coeff_t*)&r)[j]; // scalar slot j of the product
+	  }
+	}
+      }
+
+    }
+
+    // Overwrites block b of ret from the flat buffer x, converting every entry to Coeff_t.
+    // x is read site-major: (_cf_o_block_size / _block_sites) consecutive entries per block site.
+    void block_set(int b, Field& ret, const std::vector< ComplexD >& x) {
+      std::vector<int> origin;
+      block_to_coor(b,origin);
+
+      const int per_site = _cf_o_block_size / _block_sites;
+
+      for (int k=0;k<_block_sites;k++) { // only odd sites
+        const int osite = block_site_to_o_site(origin,k);
+
+        // site data viewed as a flat array of Coeff_t
+        Coeff_t* dst = (Coeff_t*)&ret._odata[osite]._internal;
+        const int base = per_site * k;
+        for (int l=0;l<per_site;l++) {
+          dst[l] = (Coeff_t)x[base + l]; // convert precision
+        }
+      }
+
+    }
+
+    // Copies block b of ret into the flat buffer x, converting every entry to ComplexD.
+    // x is written site-major: (_cf_o_block_size / _block_sites) consecutive entries per block
+    // site. x is not resized here.
+    void block_get(int b, const Field& ret, std::vector< ComplexD >& x) {
+      std::vector<int> origin;
+      block_to_coor(b,origin);
+
+      const int per_site = _cf_o_block_size / _block_sites;
+
+      for (int k=0;k<_block_sites;k++) { // only odd sites
+        const int osite = block_site_to_o_site(origin,k);
+
+        // site data viewed as a flat array of Coeff_t
+        const Coeff_t* src = (const Coeff_t*)&ret._odata[osite]._internal;
+        const int base = per_site * k;
+        for (int l=0;l<per_site;l++) {
+          x[base + l] = (ComplexD)src[l];
+        }
+      }
+
+    }
+
+    // vcscale: r = a*x with a coefficient of type vCoeff_t. The iScalar and iVector overloads
+    // only unwrap one tensor level (element by element for iVector); the vCoeff_t overload does
+    // the arithmetic.
+    template<class T>
+    void vcscale(iScalar<T>& r,const vCoeff_t& a,const iScalar<T>& x) {
+      vcscale(r._internal,a,x._internal);
+    }
+
+    template<class T,int N>
+    void vcscale(iVector<T,N>& r,const vCoeff_t& a,const iVector<T,N>& x) {
+      for (int k=0;k<N;k++) {
+        vcscale(r._internal[k],a,x._internal[k]);
+      }
+    }
+
+    void vcscale(vCoeff_t& r,const vCoeff_t& a,const vCoeff_t& x) {
+      r = a*x;
+    }
+
+    // Scales ret in place by a on the sites of block b.
+    void block_cscale(int b, const vCoeff_t& a, Field& ret) {
+
+      std::vector<int> origin;
+      block_to_coor(b,origin);
+
+      for (int k=0;k<_block_sites;k++) { // only odd sites
+        const int osite = block_site_to_o_site(origin,k);
+        vcscale(ret._odata[osite],a,ret._odata[osite]);
+      }
+    }
+
+    // Origin of canonical block cb, returned in x0 in canonical dimension order, i.e. the
+    // grid's dimensions permuted to {1,2,3,4,0}. Only valid for five dimensions.
+    void getCanonicalBlockOffset(int cb, std::vector<int>& x0) {
+      const int ndim = 5;
+      assert(_nb.size() == ndim);
+
+      // number of blocks and block size, both in canonical dimension order
+      std::vector<int> canon_nb = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] };
+      std::vector<int> canon_bs = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
+      x0.resize(ndim);
+
+      assert(cb >= 0);
+      assert(cb < canon_nb[0]*canon_nb[1]*canon_nb[2]*canon_nb[3]*canon_nb[4]);
+
+      Lexicographic::CoorFromIndex(x0,cb,canon_nb);
+
+      // block coordinate -> coordinate of the block's first site
+      for (int d=0;d<ndim;d++) {
+        x0[d] *= canon_bs[d];
+      }
+
+      //if (cb < 2)
+      //	std::cout << GridLogMessage << "Map: " << cb << " To: " << x0 << std::endl;
+    }
+
+    /*
+      Fills canonical block cb of v from buf (the inverse direction of peekBlockOfVectorCanonical).
+
+      buf is read in canonical ordering: for block site i and indices s in [0,4), c in [0,3) the
+      complex entry number ti = 12*i + 3*s + c is taken from the float pair
+      (buf[2*ti], buf[2*ti+1]) = (real, imaginary). buf must therefore hold at least
+      24 * (number of block sites) floats.
+    */
+    void pokeBlockOfVectorCanonical(int cb,Field& v,const std::vector<float>& buf) {
+      std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
+      const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4];
+
+      // The loop below reads buf[0 .. 24*_nbsc-1]; reject shorter buffers instead of reading past the end.
+      assert(buf.size() >= (size_t)(24*_nbsc));
+
+      // origin of block cb in canonical dimension order
+      std::vector<int> cx0;
+      getCanonicalBlockOffset(cb,cx0);
+
+#pragma omp parallel
+      {
+        // per-thread scratch coordinates, sized like cx0
+        std::vector<int> co0,cl0;
+        co0=cx0; cl0=cx0;
+
+#pragma omp for
+        for (int i=0;i<_nbsc;i++) {
+          Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo
+          for (int j=0;j<(int)_bsc.size();j++)
+            cl0[j] = cx0[j] + co0[j];
+
+          // canonical order {1,2,3,4,0} back to the grid's own dimension order
+          std::vector<int> l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] };
+          int oi = v._grid->oIndex(l0);
+          int ii = v._grid->iIndex(l0);
+          int lti = i;
+
+          //if (cb < 2 && i<2)
+          //  std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl;
+
+          for (int s=0;s<4;s++)
+            for (int c=0;c<3;c++) {
+              // scalar slot ii of component (s,c) at outer site oi
+              Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii];
+              int ti = 12*lti + 3*s + c;
+              ld = Coeff_t(buf[2*ti+0], buf[2*ti+1]);
+            }
+        }
+      }
+    }
+
+    /*
+      Copies canonical block cb of v into buf in canonical ordering.
+
+      buf is resized to 2*_cf_block_size floats. For block site i and indices s in [0,4),
+      c in [0,3) the complex entry number ti = 12*i + 3*s + c is written as the float pair
+      (buf[2*ti], buf[2*ti+1]) = (real, imaginary).
+    */
+    void peekBlockOfVectorCanonical(int cb,const Field& v,std::vector<float>& buf) {
+      std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
+      const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4];
+
+      // origin of block cb in canonical dimension order
+      std::vector<int> cx0;
+      getCanonicalBlockOffset(cb,cx0);
+
+      buf.resize(_cf_block_size * 2);
+
+      // The loop below writes buf[0 .. 24*_nbsc-1]; make sure the size chosen above covers that.
+      assert(buf.size() >= (size_t)(24*_nbsc));
+
+#pragma omp parallel
+      {
+        // per-thread scratch coordinates, sized like cx0
+        std::vector<int> co0,cl0;
+        co0=cx0; cl0=cx0;
+
+#pragma omp for
+        for (int i=0;i<_nbsc;i++) {
+          Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo
+          for (int j=0;j<(int)_bsc.size();j++)
+            cl0[j] = cx0[j] + co0[j];
+
+          // canonical order {1,2,3,4,0} back to the grid's own dimension order
+          std::vector<int> l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] };
+          int oi = v._grid->oIndex(l0);
+          int ii = v._grid->iIndex(l0);
+          int lti = i;
+
+          //if (cb < 2 && i<2)
+          //  std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl;
+
+          for (int s=0;s<4;s++)
+            for (int c=0;c<3;c++) {
+              // scalar slot ii of component (s,c) at outer site oi
+              Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii];
+              int ti = 12*lti + 3*s + c;
+              buf[2*ti+0] = ld.real();
+              buf[2*ti+1] = ld.imag();
+            }
+        }
+      }
+    }
+
+    /*
+      Maps block nb of source slot `slot` (a processor index in a src_nodes processor layout)
+      to the canonical local block index on this rank.
+
+      Returns -1 if, in any dimension, the block's processor coordinate differs from this
+      rank's _grid->_processor_coor; otherwise the lexicographic index of the local block
+      coordinate with respect to the canonically ordered _nb.
+
+      Only valid for five dimensions (asserted).
+    */
+    int globalToLocalCanonicalBlock(int slot,const std::vector<int>& src_nodes,int nb) {
+      // Dimensions 0..4 are indexed explicitly below, so check the rank before touching anything.
+      // (Named src_nd so that it does not shadow the member _nd.)
+      const int src_nd = (int)src_nodes.size();
+      assert(src_nd == 5);
+
+      // processor coordinate
+      std::vector<int> _src_nodes = src_nodes;
+      std::vector<int> pco(src_nd);
+      Lexicographic::CoorFromIndex(pco,slot,_src_nodes);
+      std::vector<int> cpco = { pco[1], pco[2], pco[3], pco[4], pco[0] };
+
+      // get local block
+      std::vector<int> _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] };
+      std::vector<int> c_src_local_blocks(src_nd);
+      for (int i=0;i<src_nd;i++) {
+        assert(_grid->_fdimensions[i] % (src_nodes[i] * _bs[i]) == 0);
+        // (i+4) % 5 maps grid dimension i to its canonical position
+        c_src_local_blocks[(i+4) % 5] = _grid->_fdimensions[i] / src_nodes[i] / _bs[i];
+      }
+      std::vector<int> cbcoor(src_nd); // coordinate of block in slot in canonical form
+      Lexicographic::CoorFromIndex(cbcoor,nb,c_src_local_blocks);
+
+      // cpco, cbcoor
+      std::vector<int> clbcoor(src_nd);
+      for (int i=0;i<src_nd;i++) {
+        int cgcoor = cpco[i] * c_src_local_blocks[i] + cbcoor[i]; // global block coordinate
+        int pcoor = cgcoor / _nbc[i]; // processor coordinate in my Grid
+        int tpcoor = _grid->_processor_coor[(i+1)%5]; // (i+1)%5 maps canonical dimension i back to the grid dimension
+        if (pcoor != tpcoor)
+          return -1;
+        clbcoor[i] = cgcoor - tpcoor * _nbc[i]; // canonical local block coordinate for canonical dimension i
+      }
+
+      int lnb;
+      Lexicographic::IndexFromCoor(clbcoor,lnb,_nbc);
+      //std::cout << "Mapped slot = " << slot << " nb = " << nb << " to " << lnb << std::endl;
+      return lnb;
+    }
+
+
+ };
+
+}
@@ -0,0 +1,81 @@
+namespace Grid { 
+
+/*
+  BasisFieldVector
+
+  A std::vector of Field objects built on one grid, together with thin forwarding wrappers
+  around the basis* helper functions (orthogonalize, rotate, sort, deflate). Allocation and
+  resizing print a memory report.
+*/
+template<class Field>
+class BasisFieldVector {
+ public:
+  int _Nm; // number of vectors; set by the constructor and by resize()
+
+  typedef typename Field::scalar_type Coeff_t;
+  typedef typename Field::vector_type vCoeff_t;
+  typedef typename Field::vector_object vobj;
+  typedef typename vobj::scalar_object sobj;
+
+  std::vector<Field> _v; // the basis vectors
+
+  // Logs the size in GB of n vectors on grid `value` and the current size/capacity of _v. On
+  // Linux the boss rank additionally dumps /proc/meminfo. The report is bracketed by barriers
+  // on the grid.
+  void report(int n,GridBase* value) {
+
+    std::cout << GridLogMessage << "BasisFieldVector allocated:\n";
+    std::cout << GridLogMessage << " Delta N = " << n << "\n";
+    std::cout << GridLogMessage << " Size of full vectors (size) = " << 
+      ((double)n*sizeof(vobj)*value->oSites() / 1024./1024./1024.) << " GB\n";
+    std::cout << GridLogMessage << " Size = " << _v.size() << " Capacity = " << _v.capacity() << std::endl;
+
+    value->Barrier();
+
+#ifdef __linux
+    if (value->IsBoss()) {
+      system("cat /proc/meminfo");
+    }
+#endif
+
+    value->Barrier();
+
+  }
+
+  // Allocates Nm vectors on grid `value` and reports the allocation.
+  BasisFieldVector(int Nm,GridBase* value) : _Nm(Nm), _v(Nm,value) {
+    report(Nm,value);
+  }
+  
+  ~BasisFieldVector() {
+  }
+
+  // Unchecked access to vector i.
+  Field& operator[](int i) {
+    return _v[i];
+  }
+
+  void orthogonalize(Field& w, int k) {
+    basisOrthogonalize(_v,w,k);
+  }
+
+  void rotate(Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) {
+    basisRotate(_v,Qt,j0,j1,k0,k1,Nm);
+  }
+
+  size_t size() const {
+    return _Nm;
+  }
+
+  // Grows or shrinks the basis to n vectors. New vectors are created on the grid of the
+  // existing ones, so the basis must not be empty when this is called.
+  void resize(int n) {
+    // Fetch the grid before resizing: after resize(0) there is no _v[0] left to read it from.
+    assert(!_v.empty());
+    GridBase* grid = _v[0]._grid;
+
+    if (n > _Nm)
+      _v.reserve(n);
+    
+    _v.resize(n,grid);
+
+    if (n < _Nm)
+      _v.shrink_to_fit();
+
+    report(n - _Nm,grid);
+
+    _Nm = n;
+  }
+
+  void sortInPlace(std::vector<RealD>& sort_vals, bool reverse) {
+    basisSortInPlace(_v,sort_vals,reverse);
+  }
+
+  void deflate(const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
+    basisDeflate(_v,eval,src_orig,result);
+  }
+
+ }; 
+}
@@ -21,7 +21,14 @@
    (ortho krylov low poly); and then fix up lowest say 200 eigenvalues by 1 run with high-degree poly (600 could be enough)
 */
 #include <Grid/Grid.h>
-#include <Grid/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+/////////////////////////////////////////////////////////////////////////////
+// The following are now decoupled from the Lanczos and deal with grids.
+// Safe to replace functionality
+/////////////////////////////////////////////////////////////////////////////
+#include "BlockedGrid.h"
+#include "FieldBasisVector.h"
+#include "BlockProjector.h"
 #include "FieldVectorIO.h"
 #include "Params.h"

@@ -93,19 +100,6 @@ void write_history(char* fn, std::vector<RealD>& hist) {
  fclose(f);
 }

-template<typename Field>
-class FunctionHermOp : public LinearFunction<Field> {
-public:
-  OperatorFunction<Field>   & _poly;
-  LinearOperatorBase<Field> &_Linop;
-
-  FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop) : _poly(poly), _Linop(linop) {
-  }
-
-  void operator()(const Field& in, Field& out) {
-    _poly(_Linop,in,out);
-  }
-};

 template<typename Field>
 class CheckpointedLinearFunction : public LinearFunction<Field> {
@@ -261,19 +255,6 @@ public:
  }
 };

-template<typename Field>
-class PlainHermOp : public LinearFunction<Field> {
-public:
-  LinearOperatorBase<Field> &_Linop;
-
-  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) {
-  }
-
-  void operator()(const Field& in, Field& out) {
-    _Linop.HermOp(in,out);
-  }
-};
-
 template<typename vtype, int N > using CoarseSiteFieldGeneral = iScalar< iVector<vtype, N> >;
 template<int N> using CoarseSiteFieldD = CoarseSiteFieldGeneral< vComplexD, N >;
 template<int N> using CoarseSiteFieldF = CoarseSiteFieldGeneral< vComplexF, N >;
@@ -319,7 +300,7 @@ void CoarseGridLanczos(BlockProjector<Field>& pr,RealD alpha2,RealD beta,int Npo
    Op2 = &Op2plain;
  }
  ProjectedHermOp<CoarseLatticeFermion<Nstop1>,LatticeFermion> Op2nopoly(pr,HermOp);
-  BlockImplicitlyRestartedLanczos<CoarseLatticeFermion<Nstop1> > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,betastp2,MaxIt,MinRes2);
+  ImplicitlyRestartedLanczos<CoarseLatticeFermion<Nstop1> > IRL2(*Op2,*Op2,Nstop2,Nk2,Nm2,resid2,MaxIt,betastp2,MinRes2);


  src_coarse = 1.0;
@@ -350,7 +331,7 @@ void CoarseGridLanczos(BlockProjector<Field>& pr,RealD alpha2,RealD beta,int Npo
      ) {
    

-    IRL2.calc(eval2,coef,src_coarse,Nconv,true,SkipTest2);
+    IRL2.calc(eval2,coef._v,src_coarse,Nconv,true);

    coef.resize(Nstop2);
    eval2.resize(Nstop2);
@@ -450,6 +431,7 @@ void CoarseGridLanczos(BlockProjector<Field>& pr,RealD alpha2,RealD beta,int Npo
    auto result = src_orig; 

    // undeflated solve
+    std::cout << GridLogMessage << " Undeflated solve "<<std::endl;
    result = zero;
    CG(HermOp, src_orig, result);
    //    if (UCoarseGrid->IsBoss())
@@ -457,6 +439,7 @@ void CoarseGridLanczos(BlockProjector<Field>& pr,RealD alpha2,RealD beta,int Npo
    //    CG.ResHistory.clear();

    // deflated solve with all eigenvectors
+    std::cout << GridLogMessage << " Deflated solve with all evectors"<<std::endl;
    result = zero;
    pr.deflate(coef,eval2,Nstop2,src_orig,result);
    CG(HermOp, src_orig, result);
@@ -465,6 +448,7 @@ void CoarseGridLanczos(BlockProjector<Field>& pr,RealD alpha2,RealD beta,int Npo
    //    CG.ResHistory.clear();

    // deflated solve with non-blocked eigenvectors
+    std::cout << GridLogMessage << " Deflated solve with non-blocked evectors"<<std::endl;
    result = zero;
    pr.deflate(coef,eval1,Nstop1,src_orig,result);
    CG(HermOp, src_orig, result);
@@ -473,6 +457,7 @@ void CoarseGridLanczos(BlockProjector<Field>& pr,RealD alpha2,RealD beta,int Npo
    //    CG.ResHistory.clear();

    // deflated solve with all eigenvectors and original eigenvalues from proj
+    std::cout << GridLogMessage << " Deflated solve with all eigenvectors and original eigenvalues from proj"<<std::endl;
    result = zero;
    pr.deflate(coef,eval3,Nstop2,src_orig,result);
    CG(HermOp, src_orig, result);
@@ -641,7 +626,7 @@ int main (int argc, char ** argv) {
  }

  // First round of Lanczos to get low mode basis
-  BlockImplicitlyRestartedLanczos<LatticeFermion> IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,betastp1,MaxIt,MinRes1);
+  ImplicitlyRestartedLanczos<LatticeFermion> IRL1(Op1,Op1test,Nstop1,Nk1,Nm1,resid1,MaxIt,betastp1,MinRes1);
  int Nconv;

  char tag[1024];
@@ -650,7 +635,7 @@ int main (int argc, char ** argv) {
    if (simple_krylov_basis) {
      quick_krylov_basis(evec,src,Op1,Nstop1);
    } else {
-      IRL1.calc(eval1,evec,src,Nconv,false,1);
+      IRL1.calc(eval1,evec._v,src,Nconv,false);
    }
    evec.resize(Nstop1); // and throw away superfluous
    eval1.resize(Nstop1);
@@ -0,0 +1,254 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_compressed_lanczos_reorg.cc
+
+    Copyright (C) 2017
+
+Author: Leans heavily on Christoph Lehner's code
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+/*
+ *  Reimplement the badly named "multigrid" Lanczos as compressed Lanczos using the features
+ *  in Grid that were intended to be used to support blocked Aggregates.
+ */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class Fobj,class CComplex,int nbasis>
+class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
+{ // Adds SciDAC (eigenvector) and XML (eigenvalue) checkpoint write/restore for the fine and coarse data held by the base class.
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid,
+			      LinearOperatorBase<FineField> &FineOp,
+			      int checkerboard) 
+    // All four arguments are forwarded unchanged to the base-class constructor
+    : LocalCoherenceLanczos<Fobj,CComplex,nbasis>(FineGrid,CoarseGrid,FineOp,checkerboard) 
+  {};
+  // checkpointFine: write the nbasis fine basis vectors to evecs_file (one SciDAC record each) and evals_fine to evals_file (XML tag "evals").
+  void checkpointFine(std::string evecs_file,std::string evals_file)
+  {
+    assert(this->_Aggregate.subspace.size()==nbasis); // the aggregate subspace must hold exactly nbasis vectors
+    emptyUserRecord record; // the same record object is passed with every field written
+    Grid::QCD::ScidacWriter WR;
+    WR.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      WR.writeScidacFieldRecord(this->_Aggregate.subspace[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_fine);
+  }
+  // checkpointFineRestore: counterpart of checkpointFine; reloads evals_fine from XML and the nbasis basis vectors from the SciDAC file.
+  void checkpointFineRestore(std::string evecs_file,std::string evals_file)
+  {
+    this->evals_fine.resize(nbasis);
+    this->_Aggregate.subspace.resize(nbasis,this->_FineGrid);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_fine);
+    // the XML read may change the size, so re-check it against nbasis
+    assert(this->evals_fine.size()==nbasis);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::QCD::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      this->_Aggregate.subspace[k].checkerboard=this->_checkerboard; // label the field's checkerboard before the read fills it
+      RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record);
+      
+    }
+    RD.close();
+  }
+  // checkpointCoarse: write every vector in evec_coarse to evecs_file and evals_coarse to evals_file (XML tag "evals").
+  void checkpointCoarse(std::string evecs_file,std::string evals_file)
+  {
+    int n = this->evec_coarse.size(); // number of records written is whatever evec_coarse currently holds
+    emptyUserRecord record;
+    Grid::QCD::ScidacWriter WR;
+    WR.open(evecs_file);
+    for(int k=0;k<n;k++) {
+      WR.writeScidacFieldRecord(this->evec_coarse[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_coarse);
+  }
+  // checkpointCoarseRestore: reload nvec coarse eigenvalues (XML) and eigenvectors (SciDAC); asserts the XML holds exactly nvec values.
+  void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec)
+  {
+    std::cout << "resizing coarse vecs to " << nvec<< std::endl; // NOTE(review): unlike the other messages here this one has no GridLogIRL prefix
+    this->evals_coarse.resize(nvec);
+    this->evec_coarse.resize(nvec,this->_CoarseGrid);
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_coarse);
+
+    assert(this->evals_coarse.size()==nvec); // caller-supplied nvec must match what was stored
+    emptyUserRecord record;
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    Grid::QCD::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nvec;k++) {
+      RD.readScidacFieldRecord(this->evec_coarse[k],record);
+    }
+    RD.close();
+  }
+};
+
+int main (int argc, char ** argv) {
+  // Driver: optional fine-grid Lanczos (or restore), then optional coarse-grid Lanczos (or restore); stages are selected by flags in ./Params.xml.
+  Grid_init(&argc,&argv);
+  GridLogIRL.TimingMode(1);
+
+  LocalCoherenceLanczosParams Params;
+  {
+    Params.omega.resize(10);
+    Params.blockSize.resize(5);
+    XmlWriter writer("Params_template.xml");
+    write(writer,"Params",Params);
+    std::cout << GridLogMessage << " Written Params_template.xml" <<std::endl;
+  }
+  // A template is written first; the parameters actually used are then read from ./Params.xml into the same object.
+  { 
+    XmlReader reader(std::string("./Params.xml"));
+    read(reader, "Params", Params);
+  }
+
+  int     Ls = (int)Params.omega.size(); // fifth-dimension extent is the number of omega entries
+  RealD mass = Params.mass;
+  RealD M5   = Params.M5;
+  std::vector<int> blockSize = Params.blockSize;
+
+  // Grids
+  GridCartesian         * UGrid     = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								     GridDefaultSimd(Nd,vComplex::Nsimd()),
+								     GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> fineLatt     = GridDefaultLatt();
+  int dims=fineLatt.size();
+  assert(blockSize.size()==dims+1); // one block size per lattice direction plus one more, used for Ls below
+  std::vector<int> coarseLatt(dims);
+  std::vector<int> coarseLatt5d ;
+
+  for (int d=0;d<coarseLatt.size();d++){
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]); // block size must divide the fine extent exactly
+  }
+
+  std::cout << GridLogMessage<< " 5d coarse lattice is ";
+  for (int i=0;i<coarseLatt.size();i++){
+    std::cout << coarseLatt[i]<<"x";
+  } 
+  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls); // coarse fifth-dimension extent, again required to divide exactly
+  std::cout << cLs<<std::endl;
+  
+  GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
+  GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
+  GridRedBlackCartesian * CoarseGrid5rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid5); // NOTE(review): fine grid uses makeFiveDimRedBlackGrid(Ls,UGrid); confirm the FourDim call on a 5d grid is intended
+
+  // Gauge field
+  LatticeGaugeField Umu(UGrid);
+  FieldMetaData header;
+  NerscIO::readConfiguration(Umu,header,Params.config); // configuration file name comes from Params.config
+  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << "   Ls: " << Ls << std::endl;
+
+  // ZMobius EO Operator
+  ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.);
+  SchurDiagTwoOperator<ZMobiusFermionR,LatticeFermion> HermOp(Ddwf);
+
+  // Eigenvector storage
+  LanczosParams fine  =Params.FineParams;  
+  LanczosParams coarse=Params.CoarseParams;  
+
+  const int Ns1 = fine.Nstop;   const int Ns2 = coarse.Nstop;
+  const int Nk1 = fine.Nk;      const int Nk2 = coarse.Nk;
+  const int Nm1 = fine.Nm;      const int Nm2 = coarse.Nm;
+
+  std::cout << GridLogMessage << "Keep " << fine.Nstop   << " fine   vectors" << std::endl;
+  std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
+  assert(Nm2 >= Nm1);
+
+  const int nbasis= 60; // compile-time basis size, used as a template argument below
+  assert(nbasis==Ns1); // so the fine Nstop read from Params.xml has to be 60
+  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd);
+  std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
+
+  assert( (Params.doFine)||(Params.doFineRead)); // the fine basis must be either computed or read back
+
+  if ( Params.doFine ) { 
+    std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
+    _LocalCoherenceLanczos.calcFine(fine.Cheby,
+		 fine.Nstop,fine.Nk,fine.Nm,
+		 fine.resid,fine.MaxIt, 
+		 fine.betastp,fine.MinRes);
+
+    std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
+    _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
+    _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse (loose) check: called with 100x the fine residual
+    _LocalCoherenceLanczos.Orthogonalise();
+  }
+
+  if ( Params.doFineRead ) { 
+    _LocalCoherenceLanczos.checkpointFineRestore(std::string("evecs.scidac"),std::string("evals.xml"));
+    _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse (loose) check: called with 100x the fine residual
+    _LocalCoherenceLanczos.Orthogonalise();
+  }
+
+  if ( Params.doCoarse ) {
+    std::cout << GridLogMessage << "Orthogonalising " << nbasis<<" Nm "<<Nm2<< std::endl; // NOTE(review): no Orthogonalise() call follows in this branch; message looks stale
+    
+    std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
+    _LocalCoherenceLanczos.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
+			      coarse.Nstop, coarse.Nk,coarse.Nm,
+			      coarse.resid, coarse.MaxIt, 
+			      coarse.betastp,coarse.MinRes);
+
+
+    std::cout << GridLogIRL<<"Checkpointing coarse evecs"<<std::endl;
+    _LocalCoherenceLanczos.checkpointCoarse(std::string("evecs.coarse.scidac"),std::string("evals.coarse.xml"));
+  }
+
+  if ( Params.doCoarseRead ) {
+    // Verify we can reread the coarse checkpoint (the original author marked this with '???')
+    _LocalCoherenceLanczos.checkpointCoarseRestore(std::string("evecs.coarse.scidac"),std::string("evals.coarse.xml"),coarse.Nstop);
+    _LocalCoherenceLanczos.testCoarse(coarse.resid*100.0,Params.Smoother,Params.coarse_relax_tol); // Coarse (loose) check: 100x the coarse residual
+  }
+  Grid_finalize();
+}
+
@@ -84,11 +84,12 @@ int main (int argc, char ** argv)

  std::vector<double> Coeffs { 0.,-1.};
  Polynomial<FermionField> PolyX(Coeffs);
-  Chebyshev<FermionField> Cheb(0.2,5.,11);
-//  ChebyshevLanczos<LatticeFermion> Cheb(9.,1.,0.,20);
-//  Cheb.csv(std::cout);
-//  exit(-24);
-  ImplicitlyRestartedLanczos<FermionField> IRL(HermOp,Cheb,Nstop,Nk,Nm,resid,MaxIt);
+  Chebyshev<FermionField> Cheby(0.2,5.,11);
+
+  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
+     PlainHermOp<FermionField> Op     (HermOp);
+
+  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);

  
  std::vector<RealD>          eval(Nm);
@@ -119,12 +119,13 @@ int main (int argc, char ** argv)
  RealD beta  = 0.1;
  RealD mu    = 0.0;
  int order = 11;
-  ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order);
+  Chebyshev<LatticeComplex> Cheby(alpha,beta,order);
  std::ofstream file("cheby.dat");
  Cheby.csv(file);

-  HermOpOperatorFunction<LatticeComplex> X;
  DumbOperator<LatticeComplex> HermOp(grid);
+  FunctionHermOp<LatticeComplex> OpCheby(Cheby,HermOp);
+     PlainHermOp<LatticeComplex> Op(HermOp);

  const int Nk = 40;
  const int Nm = 80;
@@ -133,8 +134,9 @@ int main (int argc, char ** argv)
  int Nconv;
  RealD eresid = 1.0e-6;

-  ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nk,Nm,eresid,Nit);
-  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nk,Nm,eresid,Nit);
+
+  ImplicitlyRestartedLanczos<LatticeComplex> IRL(Op,Op,Nk,Nk,Nm,eresid,Nit);
+  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(OpCheby,Op,Nk,Nk,Nm,eresid,Nit);

  LatticeComplex src(grid); gaussian(RNG,src);
  {
@@ -86,9 +86,12 @@ int main(int argc, char** argv) {

  std::vector<double> Coeffs{0, 1.};
  Polynomial<FermionField> PolyX(Coeffs);
-  Chebyshev<FermionField> Cheb(0.0, 10., 12);
-  ImplicitlyRestartedLanczos<FermionField> IRL(HermOp, PolyX, Nstop, Nk, Nm,
-                                               resid, MaxIt);
+  Chebyshev<FermionField> Cheby(0.0, 10., 12);
+
+  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
+     PlainHermOp<FermionField> Op     (HermOp);
+
+  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);

  std::vector<RealD> eval(Nm);
  FermionField src(FGrid);
@@ -555,13 +555,13 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Calling Aggregation class to build subspace" <<std::endl;
  std::cout<<GridLogMessage << "**************************************************"<< std::endl;
  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
-  Subspace Aggregates(Coarse5d,FGrid);
+  Subspace Aggregates(Coarse5d,FGrid,0);
  //  Aggregates.CreateSubspace(RNG5,HermDefOp,nbasis);
  assert ( (nbasis & 0x1)==0);
  int nb=nbasis/2;
  std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
-  //  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
-  Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
+  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
+  //  Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
  for(int n=0;n<nb;n++){
    G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
    std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
@@ -52,15 +52,28 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

-  int nrhs = UGrid->RankCount() ;
-
  /////////////////////////////////////////////
  // Split into 1^4 mpi communicators
  /////////////////////////////////////////////
+  for(int i=0;i<argc;i++){
+    if(std::string(argv[i]) == "--split"){
+      for(int k=0;k<mpi_layout.size();k++){
+	std::stringstream ss; 
+	ss << argv[i+1+k]; 
+	ss >> mpi_split[k];
+      }
+      break;
+    }
+  }
+
+  int nrhs = 1;
+  int me;
+  for(int i=0;i<mpi_layout.size();i++) nrhs *= (mpi_layout[i]/mpi_split[i]);
+
  GridCartesian         * SGrid = new GridCartesian(GridDefaultLatt(),
 						    GridDefaultSimd(Nd,vComplex::Nsimd()),
 						    mpi_split,
-						    *UGrid); 
+						    *UGrid,me); 

  GridCartesian         * SFGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
@@ -70,7 +83,6 @@ int main (int argc, char ** argv)
  // Set up the problem as a 4d spreadout job
  ///////////////////////////////////////////////
  std::vector<int> seeds({1,2,3,4});
-
  GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds);
  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
  std::vector<FermionField>    src(nrhs,FGrid);
@@ -93,7 +105,7 @@ int main (int argc, char ** argv)
  emptyUserRecord record;
  std::string file("./scratch.scidac");
  std::string filef("./scratch.scidac.ferm");
-  int me = UGrid->ThisRank();
+
  LatticeGaugeField s_Umu(SGrid);
  FermionField s_src(SFGrid);
  FermionField s_src_split(SFGrid);
@@ -169,7 +181,7 @@ int main (int argc, char ** argv)
  for(int n=0;n<nrhs;n++){
    FGrid->Barrier();
    if ( n==me ) {
-      std::cerr << GridLogMessage<<"Split "<< me << " " << norm2(s_src_split) << " " << norm2(s_src)<< " diff " << norm2(s_tmp)<<std::endl;
+      std::cout << GridLogMessage<<"Split "<< me << " " << norm2(s_src_split) << " " << norm2(s_src)<< " diff " << norm2(s_tmp)<<std::endl;
    }
    FGrid->Barrier();
  }
@@ -190,7 +202,7 @@ int main (int argc, char ** argv)

  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
-  ConjugateGradient<FermionField> CG((1.0e-8/(me+1)),10000);
+  ConjugateGradient<FermionField> CG((1.0e-5/(me+1)),10000);
  s_res = zero;
  CG(HermOp,s_src,s_res);

@@ -218,7 +230,6 @@ int main (int argc, char ** argv)
    std::cout << " diff " <<tmp<<std::endl;
  }
  */
-
  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
  for(int n=0;n<nrhs;n++){
    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+   /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -47,20 +47,36 @@ int main (int argc, char ** argv)
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> mpi_split (mpi_layout.size(),1);

-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

-  int nrhs = UGrid->RankCount() ;
-
  /////////////////////////////////////////////
  // Split into 1^4 mpi communicators
  /////////////////////////////////////////////
+
+  for(int i=0;i<argc;i++){
+    if(std::string(argv[i]) == "--split"){
+      for(int k=0;k<mpi_layout.size();k++){
+	std::stringstream ss; 
+	ss << argv[i+1+k]; 
+	ss >> mpi_split[k];
+      }
+      break;
+    }
+  }
+
+  int nrhs = 1;
+  int me;
+  for(int i=0;i<mpi_layout.size();i++) nrhs *= (mpi_layout[i]/mpi_split[i]);
+
  GridCartesian         * SGrid = new GridCartesian(GridDefaultLatt(),
 						    GridDefaultSimd(Nd,vComplex::Nsimd()),
 						    mpi_split,
-						    *UGrid); 
+						    *UGrid,me); 

  GridCartesian         * SFGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
@@ -78,16 +94,46 @@ int main (int argc, char ** argv)
  std::vector<FermionField> result(nrhs,FGrid);
  FermionField tmp(FGrid);

-  for(int s=0;s<nrhs;s++) random(pRNG5,src[s]);
  for(int s=0;s<nrhs;s++) result[s]=zero;
+#define LEXICO_TEST
+#ifdef LEXICO_TEST
+  {
+    LatticeFermion lex(FGrid);  lex = zero;
+    LatticeFermion ftmp(FGrid);
+    Integer stride =10000;
+    double nrm;
+    LatticeComplex coor(FGrid);
+    for(int d=0;d<5;d++){
+      LatticeCoordinate(coor,d);
+      ftmp = stride;
+      ftmp = ftmp * coor;
+      lex = lex + ftmp;
+      stride=stride/10;
+    }
+    for(int s=0;s<nrhs;s++) {
+      src[s]=lex;
+      ftmp = 1000*1000*s;
+      src[s] = src[s] + ftmp;
+    }    
+  }
+#else
+  for(int s=0;s<nrhs;s++) {
+    random(pRNG5,src[s]);
+    tmp = 100.0*s;
+    src[s] = (src[s] * 0.1) + tmp;
+    std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
+  }
+#endif
+
+  for(int n =0 ; n< nrhs ; n++) { 
+    //    std::cout << " src"<<n<<"\n"<< src[n] <<std::endl;
+  }

  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);

  /////////////////
  // MPI only sends
  /////////////////
-  int me = UGrid->ThisRank();
-
  LatticeGaugeField s_Umu(SGrid);
  FermionField s_src(SFGrid);
  FermionField s_tmp(SFGrid);
@@ -98,6 +144,37 @@ int main (int argc, char ** argv)
  ///////////////////////////////////////////////////////////////
  Grid_split  (Umu,s_Umu);
  Grid_split  (src,s_src);
+  std::cout << GridLogMessage << " split rank  " <<me << " s_src "<<norm2(s_src)<<std::endl;
+  //  std::cout << " s_src\n "<< s_src <<std::endl;
+
+#ifdef LEXICO_TEST
+  FermionField s_src_tmp(SFGrid);
+  FermionField s_src_diff(SFGrid);
+  {
+    LatticeFermion lex(SFGrid);  lex = zero;
+    LatticeFermion ftmp(SFGrid);
+    Integer stride =10000;
+    double nrm;
+    LatticeComplex coor(SFGrid);
+    for(int d=0;d<5;d++){
+      LatticeCoordinate(coor,d);
+      ftmp = stride;
+      ftmp = ftmp * coor;
+      lex = lex + ftmp;
+      stride=stride/10;
+    }
+    s_src_tmp=lex;
+    ftmp = 1000*1000*me;
+    s_src_tmp = s_src_tmp + ftmp;
+  }
+  s_src_diff = s_src_tmp - s_src;
+  std::cout << GridLogMessage <<" LEXICO test:  s_src_diff " << norm2(s_src_diff)<<std::endl;
+
+  //  std::cout << " s_src \n" << s_src << std::endl;
+  //  std::cout << " s_src_tmp \n" << s_src_tmp << std::endl;
+  //  std::cout << " s_src_diff \n" << s_src_diff << std::endl;
+  //  exit(0);
+#endif

  ///////////////////////////////////////////////////////////////
  // Set up N-solvers as trivially parallel
@@ -113,10 +190,11 @@ int main (int argc, char ** argv)

  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
-  ConjugateGradient<FermionField> CG((1.0e-8/(me+1)),10000);
+  ConjugateGradient<FermionField> CG((1.0e-2),10000);
  s_res = zero;
  CG(HermOp,s_src,s_res);

+  std::cout << GridLogMessage << " split residual norm "<<norm2(s_res)<<std::endl;
  /////////////////////////////////////////////////////////////
  // Report how long they all took
  /////////////////////////////////////////////////////////////
@@ -134,10 +212,12 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
  Grid_unsplit(result,s_res);

+
  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
  for(int n=0;n<nrhs;n++){
+    std::cout << GridLogMessage<< " res["<<n<<"] norm "<<norm2(result[n])<<std::endl;
    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
-    std::cout << GridLogMessage<<" resid["<<n<<"]  "<< norm2(tmp)<<std::endl;
+    std::cout << GridLogMessage<<" resid["<<n<<"]  "<< norm2(tmp)/norm2(src[n])<<std::endl;
  }

  Grid_finalize();
@@ -47,7 +47,9 @@ int main (int argc, char ** argv)
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> mpi_split (mpi_layout.size(),1);

-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@@ -57,10 +59,11 @@ int main (int argc, char ** argv)
  /////////////////////////////////////////////
  // Split into 1^4 mpi communicators
  /////////////////////////////////////////////
+  int me;
  GridCartesian         * SGrid = new GridCartesian(GridDefaultLatt(),
 						    GridDefaultSimd(Nd,vComplex::Nsimd()),
 						    mpi_split,
-						    *UGrid); 
+						    *UGrid,me); 

  GridCartesian         * SFGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
@@ -89,8 +92,6 @@ int main (int argc, char ** argv)
  /////////////////
  // MPI only sends
  /////////////////
-  int me = UGrid->ThisRank();
-
  LatticeGaugeField s_Umu(SGrid);
  FermionField s_src(SFGrid);
  FermionField s_src_e(SFrbGrid);
@@ -0,0 +1,157 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_mrhs_cg.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{ // Multi-RHS DWF CG test: one source per split communicator is solved independently, gathered, and its residual checked on the full grid.
+  typedef typename DomainWallFermionR::FermionField FermionField; 
+  typedef typename DomainWallFermionR::ComplexField ComplexField; 
+  typename DomainWallFermionR::ImplParams params; 
+
+  const int Ls=4;
+
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  std::vector<int> mpi_split (mpi_layout.size(),1);
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  /////////////////////////////////////////////
+  // Split into sub-communicators: 1^4 by default, overridden by --split
+  /////////////////////////////////////////////
+  // Usage: --split n0 n1 ... (one integer per MPI dimension, each in its own argument)
+  for(int i=0;i<argc;i++){
+    if(std::string(argv[i]) == "--split"){
+      for(int k=0;k<mpi_layout.size() && i+1+k<argc;k++){ // stop at the end of argv; unparsed dimensions keep the default of 1
+	std::stringstream ss; 
+	ss << argv[i+1+k]; 
+	ss >> mpi_split[k];
+      }
+      break;
+    }
+  }
+
+  int nrhs = 1;
+  int me; // index of the split communicator this rank belongs to; filled in by the GridCartesian constructor below
+  for(int i=0;i<mpi_layout.size();i++) nrhs *= (mpi_layout[i]/mpi_split[i]); // number of split communicators = number of right-hand sides
+
+  GridCartesian         * SGrid = new GridCartesian(GridDefaultLatt(),
+						    GridDefaultSimd(Nd,vComplex::Nsimd()),
+						    mpi_split,
+						    *UGrid,me); 
+
+  GridCartesian         * SFGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
+  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
+  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);
+
+  ///////////////////////////////////////////////
+  // Set up the problem as a 4d spreadout job
+  ///////////////////////////////////////////////
+  std::vector<int> seeds({1,2,3,4});
+
+  GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds);
+  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
+  std::vector<FermionField>    src(nrhs,FGrid);
+  std::vector<FermionField> src_chk(nrhs,FGrid);
+  std::vector<FermionField> result(nrhs,FGrid);
+  FermionField tmp(FGrid);
+
+  for(int s=0;s<nrhs;s++) random(pRNG5,src[s]);
+  for(int s=0;s<nrhs;s++) result[s]=zero;
+
+  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
+
+  /////////////////
+  // MPI only sends
+  /////////////////
+
+  LatticeGaugeField s_Umu(SGrid);
+  FermionField s_src(SFGrid);
+  FermionField s_tmp(SFGrid);
+  FermionField s_res(SFGrid);
+
+  ///////////////////////////////////////////////////////////////
+  // split the source out using MPI instead of I/O
+  ///////////////////////////////////////////////////////////////
+  Grid_split  (Umu,s_Umu);
+  Grid_split  (src,s_src);
+
+  ///////////////////////////////////////////////////////////////
+  // Set up N-solvers as trivially parallel
+  ///////////////////////////////////////////////////////////////
+  RealD mass=0.01;
+  RealD M5=1.8;
+  DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5); // full-grid operator, used only for the residual check
+  DomainWallFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5); // split-grid operator, used for the solve
+
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+  std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+
+  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
+  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
+  ConjugateGradient<FermionField> CG((1.0e-8/(me+1)),10000); // tolerance differs per split communicator, so iteration counts differ too
+  s_res = zero;
+  CG(HermOp,s_src,s_res);
+
+  /////////////////////////////////////////////////////////////
+  // Report how long they all took
+  /////////////////////////////////////////////////////////////
+  std::vector<uint32_t> iterations(nrhs,0);
+  iterations[me] = CG.IterationsToComplete; // me is the split-communicator index, so it stays inside the nrhs-sized vector
+
+  for(int n=0;n<nrhs;n++){
+    UGrid->GlobalSum(iterations[n]); // NOTE(review): every rank of a split communicator contributes, so counts look multiplied when a split holds several ranks - confirm
+    std::cout << GridLogMessage<<" Rank "<<n<<" "<< iterations[n]<<" CG iterations"<<std::endl;
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Gather and residual check on the results
+  /////////////////////////////////////////////////////////////
+  std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
+  Grid_unsplit(result,s_res);
+
+  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
+  for(int n=0;n<nrhs;n++){
+    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n]; // residual of the gathered solution under the full-grid operator
+    std::cout << GridLogMessage<<" resid["<<n<<"]  "<< norm2(tmp)<<std::endl;
+  }
+
+  Grid_finalize();
+}
@@ -48,7 +48,6 @@ struct scal {
 int main (int argc, char ** argv)
 {
  typedef typename ImprovedStaggeredFermionR::FermionField FermionField; 
-  typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; 
  typename ImprovedStaggeredFermionR::ImplParams params; 

  Grid_init(&argc,&argv);
@@ -0,0 +1,90 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_wilson_cg_schur.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>   // Generic single-member wrapper; nothing else in this test file refers to it.
+struct scal {
+  d internal;        // The one wrapped value, of the template parameter type d.
+};
+
+  Gamma::Algebra Gmu [] = {   // Gamma-algebra tags listed in X, Y, Z, T order; main() in this file never reads the table.
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  // Time one red-black Schur CG solve of the improved staggered operator on an
+  // RNG-generated gauge field, then print the elapsed time, flop estimate and rate.
+  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
+  Grid_init(&argc,&argv);
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(&Grid);
+  // Fixed integer seeds (presumably so that repeated runs see the same fields).
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
+  // Random right-hand side; the solution vector starts from zero.
+  FermionField    src(&Grid); random(pRNG,src);
+  FermionField result(&Grid); result=zero;
+
+  // The same gauge field is supplied for both gauge-field arguments of the operator.
+  RealD mass=0.1;
+  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
+
+  ConjugateGradient<FermionField> CG(1.0e-8,10000);
+  SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
+
+  double volume=1.0;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }
+  double t1=usecond();
+  SchurSolver(Ds,src,result);
+  double t2=usecond();
+
+  // Schur solver: uses DeoDoe => volume * 1146 flops for each CG iteration
+  double ncall=CG.IterationsToComplete;
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146
+  // flops is a total count, not a rate; flops per microsecond equals Mflop/s (assumes usecond() is in usec).
+  std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+  std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+
+  Grid_finalize();
+}