Merge branch 'develop' into feature/gpu-port

2025-10-26 09:39:34 +00:00 · 2018-12-13 05:11:34 +00:00
parent adbdc4e65b c509bd3fe2
commit b57a4d32aa
647 changed files with 49155 additions and 11160 deletions
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -0,0 +1,61 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/Algorithms.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_ALGORITHMS_H
+#define GRID_ALGORITHMS_H
+
+#include <Grid/algorithms/SparseMatrix.h>
+#include <Grid/algorithms/LinearOperator.h>
+#include <Grid/algorithms/Preconditioner.h>
+
+#include <Grid/algorithms/approx/Zolotarev.h>
+#include <Grid/algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/Remez.h>
+#include <Grid/algorithms/approx/MultiShiftFunction.h>
+#include <Grid/algorithms/approx/Forecast.h>
+
+#include <Grid/algorithms/iterative/Deflation.h>
+#include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateResidual.h>
+#include <Grid/algorithms/iterative/NormalEquations.h>
+#include <Grid/algorithms/iterative/SchurRedBlack.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/CoarsenedMatrix.h>
+#include <Grid/algorithms/FFT.h>
+
+// EigCg
+// Pcg
+// Hdcg
+// GCR
+// etc..
+
+#endif
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -0,0 +1,486 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/CoarsenedMatrix.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
+#define  GRID_ALGORITHM_COARSENED_MATRIX_H
+
+
+NAMESPACE_BEGIN(Grid);
+
+// Describes the stencil used by the coarse operator: one forward and one
+// backward hop per dimension, followed by a single zero-displacement point.
+class Geometry {
+  //    int dimension;
+public:
+  int npoint;                     // number of stencil points, 2*nd+1
+  std::vector<int> directions   ; // dimension index of each stencil point
+  std::vector<int> displacements; // +1, -1 or 0 for each stencil point
+
+  // _d is the number of grid dimensions.  For _d==5 a 4d stencil is built
+  // and every direction index is shifted up by one.
+  Geometry(int _d)  {
+
+    const bool fiveD = (_d==5);
+    const int  shift = fiveD ? 1 : 0;   // offset added to each direction index
+    const int  nd    = fiveD ? 4 : _d;  // make coarse grid stencil for 4d , not 5d
+    const int  self  = 2*nd;            // slot of the zero-displacement point
+
+    npoint = self+1;
+    directions.resize(npoint);
+    displacements.resize(npoint);
+
+    for(int mu=0;mu<nd;mu++){
+      const int fwd = 2*mu;
+      const int bwd = 2*mu+1;
+      directions   [fwd] = mu+shift;
+      displacements[fwd] = +1;
+      directions   [bwd] = mu+shift;
+      displacements[bwd] = -1;
+    }
+    directions   [self]=0;
+    displacements[self]=0;
+
+    //// report back
+    std::cout<<GridLogMessage<<"directions    :";
+    for(int dir : directions) std::cout<< dir<< " ";
+    std::cout <<std::endl;
+    std::cout<<GridLogMessage<<"displacements :";
+    for(int disp : displacements) std::cout<< disp<< " ";
+    std::cout<<std::endl;
+  }
+  
+  /*
+  // Original cleaner code
+  Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
+  for(int d=0;d<dimension;d++){
+  directions[2*d  ] = d;
+  directions[2*d+1] = d;
+  displacements[2*d  ] = +1;
+  displacements[2*d+1] = -1;
+  }
+  directions   [2*dimension]=0;
+  displacements[2*dimension]=0;
+  }
+  std::vector<int> GetDelta(int point) {
+  std::vector<int> delta(dimension,0);
+  delta[directions[point]] = displacements[point];
+  return delta;
+  };
+  */    
+
+};
+  
+// Holds nbasis fine-grid vectors ("subspace") and provides projection of
+// fine fields onto that basis and promotion of coarse vectors back from it.
+template<class Fobj,class CComplex,int nbasis>
+class Aggregation   {
+public:
+  typedef iVector<CComplex,nbasis >             siteVector;
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+
+  GridBase *CoarseGrid;
+  GridBase *FineGrid;
+  std::vector<Lattice<Fobj> > subspace;  // exactly nbasis vectors on FineGrid
+  int checkerboard;
+  int Checkerboard(void){return checkerboard;}
+
+  // Stores the two grid pointers and the checkerboard label, and allocates
+  // nbasis basis vectors on _FineGrid.
+  Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : 
+    CoarseGrid(_CoarseGrid),
+    FineGrid(_FineGrid),
+    subspace(nbasis,_FineGrid),
+    checkerboard(_checkerboard)
+  {
+  };
+  
+  // Runs blockOrthogonalise over the basis twice (two passes).
+  void Orthogonalise(void){
+    CoarseScalar InnerProd(CoarseGrid); 
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+    //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
+    //      CheckOrthogonal();
+  } 
+
+  // Diagnostic only: for each basis vector i, projects subspace[i] onto the
+  // basis, subtracts the result from a coarse vector that is 1 in component i
+  // on every site, and prints norm2 of the difference.
+  void CheckOrthogonal(void){
+    CoarseVector iProj(CoarseGrid); 
+    CoarseVector eProj(CoarseGrid); 
+    for(int i=0;i<nbasis;i++){
+      blockProject(iProj,subspace[i],subspace);
+      eProj=Zero(); 
+      thread_loop( (int ss=0;ss<CoarseGrid->oSites();ss++),{
+        eProj[ss](i)=CComplex(1.0);
+      });
+      eProj=eProj - iProj;
+      std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
+    }
+    std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
+  }
+
+  // CoarseVec <- blockProject of FineVec onto the stored basis.
+  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
+    blockProject(CoarseVec,FineVec,subspace);
+  }
+
+  // FineVec <- blockPromote of CoarseVec; FineVec takes the checkerboard of subspace[0].
+  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
+    FineVec.Checkerboard() = subspace[0].Checkerboard();
+    blockPromote(CoarseVec,FineVec,subspace);
+  }
+
+  // Fills every basis vector with random(RNG,...) and then orthogonalises.
+  void CreateSubspaceRandom(GridParallelRNG &RNG){
+    for(int i=0;i<nbasis;i++){
+      random(RNG,subspace[i]);
+      std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
+    }
+    Orthogonalise();
+  }
+
+  /*
+    virtual void CreateSubspaceLanczos(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) 
+    {
+    // Run a Lanczos with sloppy convergence
+    const int Nstop = nn;
+    const int Nk = nn+20;
+    const int Np = nn+20;
+    const int Nm = Nk+Np;
+    const int MaxIt= 10000;
+    RealD resid = 1.0e-3;
+
+    Chebyshev<FineField> Cheb(0.5,64.0,21);
+    ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
+    //	IRL.lock = 1;
+
+    FineField noise(FineGrid); gaussian(RNG,noise);
+    FineField tmp(FineGrid); 
+    std::vector<RealD>     eval(Nm);
+    std::vector<FineField> evec(Nm,FineGrid);
+
+    int Nconv;
+    IRL.calc(eval,evec,
+    noise,
+    Nconv);
+
+    // pull back nn vectors
+    for(int b=0;b<nn;b++){
+
+    subspace[b]   = evec[b];
+
+    std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
+
+    hermop.Op(subspace[b],tmp); 
+    std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
+
+    noise = tmp -  sqrt(eval[b])*subspace[b] ;
+
+    std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
+
+    noise = tmp +  eval[b]*subspace[b] ;
+
+    std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<"  ;  [ M - Lambda ]_"<<b<<" vec_"<<b<<"  = " <<norm2(noise)<<std::endl;
+
+    }
+    Orthogonalise();
+    for(int b=0;b<nn;b++){
+    std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
+    }
+    }
+  */
+
+  // Builds the first nn basis vectors: each starts from gaussian noise scaled
+  // to unit norm2, is passed through one ConjugateGradient solve against
+  // hermop, rescaled to unit norm2 again and stored.  Orthogonalise() is
+  // called once all nn vectors are set.
+  //   RNG    : parallel random number generator used for the noise
+  //   hermop : operator handed to the CG solver and used for the logged norms
+  //   nn     : number of basis vectors to fill; must not exceed nbasis
+  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
+
+    // subspace holds exactly nbasis vectors; a larger nn would index past its end
+    assert(nn<=nbasis);
+
+    RealD scale;
+
+    ConjugateGradient<FineField> CG(1.0e-2,10000);
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+
+    for(int b=0;b<nn;b++){
+
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      // single filtering pass; the loop bound is the number of passes
+      for(int i=0;i<1;i++){
+
+        CG(hermop,noise,subspace[b]);
+
+        noise = subspace[b];
+        scale = std::pow(norm2(noise),-0.5); 
+        noise=noise*scale;
+
+      }
+
+      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
+      subspace[b]   = noise;
+
+    }
+
+    Orthogonalise();
+
+  }
+};
+// Fine Object == (per site) type of fine field
+// nbasis      == number of deflation vectors
+// Coarse-grid operator: one nbasis x nbasis matrix field per stencil point,
+// built from a fine linear operator and an Aggregation basis by CoarsenOperator.
+template<class Fobj,class CComplex,int nbasis>
+class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+public:
+    
+  typedef iVector<CComplex,nbasis >             siteVector;
+  typedef Lattice<siteVector>                 CoarseVector;
+  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
+
+  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj >        FineField;
+
+  ////////////////////
+  // Data members
+  ////////////////////
+  Geometry         geom;
+  GridBase *       _grid; 
+  CartesianStencil<siteVector,siteVector> Stencil; 
+
+  std::vector<CoarseMatrix> A;  // one matrix field per stencil point (geom.npoint of them)
+
+      
+  ///////////////////////
+  // Interface
+  ///////////////////////
+  GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
+
+  // Applies the coarse operator:
+  //   out[ss] = sum over stencil points of A[point][ss] * neighbour(in,point,ss)
+  // where the neighbour is read from `in` (permuted if the stencil entry asks
+  // for it) or from the stencil's communication buffer.  Returns norm2(out).
+  RealD M (const CoarseVector &in, CoarseVector &out){
+
+    conformable(_grid,in.Grid());
+    conformable(in.Grid(),out.Grid());
+
+    SimpleCompressor<siteVector> compressor;
+    Stencil.HaloExchange(in,compressor);
+    auto in_v = in.View();
+    // The result is streamed through a view of `out`; taking the view of `in`
+    // here would leave `out` unwritten.
+    auto out_v = out.View();
+    thread_loop( (int ss=0;ss<Grid()->oSites();ss++),{
+      siteVector res = Zero();
+      siteVector nbr;
+      int ptype;
+      StencilEntry *SE;
+      for(int point=0;point<geom.npoint;point++){
+
+        SE=Stencil.GetEntry(ptype,point,ss);
+
+        if(SE->_is_local&&SE->_permute) { 
+          permute(nbr,in_v[SE->_offset],ptype);
+        } else if(SE->_is_local) { 
+          nbr = in_v[SE->_offset];
+        } else {
+          nbr = Stencil.CommBuf()[SE->_offset];
+        }
+        auto A_point = A[point].View();
+        res = res + A_point[ss]*nbr;
+      }
+      vstream(out_v[ss],res);
+    });
+    return norm2(out);
+  };
+
+  // Forwards to M.
+  // NOTE(review): this equals the adjoint only if the coarse operator is
+  // Hermitian -- confirm against callers.
+  RealD Mdag (const CoarseVector &in, CoarseVector &out){ 
+    return M(in,out);
+  };
+
+  // Defer support for further coarsening for now
+  void Mdiag    (const CoarseVector &in,  CoarseVector &out){};
+  void Mdir     (const CoarseVector &in,  CoarseVector &out,int dir, int disp){};
+
+  // Builds the stencil geometry from the grid's dimension count, sets up the
+  // stencil and allocates one matrix field per stencil point.  Initialisers
+  // are listed in member declaration order, which is the order they run in.
+  CoarsenedMatrix(GridCartesian &CoarseGrid) 	: 
+
+    geom(CoarseGrid._ndimension),
+    _grid(&CoarseGrid),
+    Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
+    A(geom.npoint,&CoarseGrid)
+  {
+  };
+
+  // Computes the matrix fields A[p] from `linop` and the basis in `Subspace`.
+  // For every basis vector i and stencil point p the fine operator term
+  // (OpDiag for zero displacement, OpDir otherwise) is applied, the result is
+  // split by fine coordinate into a part projected into A[self] and a part
+  // projected into A[p], and column i of those matrices is filled.
+  // Re-orthogonalises Subspace.subspace first and calls AssertHermitian() last.
+  void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace){
+
+    FineField iblock(FineGrid); // contributions from within this block
+    FineField oblock(FineGrid); // contributions from outwith this block
+
+    FineField     phi(FineGrid);
+    FineField     tmp(FineGrid);   // only used by the disabled test code below
+    FineField     zz(FineGrid); zz=Zero();
+    FineField    Mphi(FineGrid);
+
+    Lattice<iScalar<vInteger> > coor(FineGrid);
+
+    CoarseVector iProj(Grid()); 
+    CoarseVector oProj(Grid()); 
+    CoarseScalar InnerProd(Grid()); 
+
+    // Orthogonalise the subblocks over the basis
+    blockOrthogonalise(InnerProd,Subspace.subspace);
+
+    // Compute the matrix elements of linop between this orthonormal
+    // set of vectors.
+    int self_stencil=-1;
+    for(int p=0;p<geom.npoint;p++){ 
+      A[p]=Zero();
+      if( geom.displacements[p]==0){
+        self_stencil=p;
+      }
+    }
+    assert(self_stencil!=-1);
+
+    for(int i=0;i<nbasis;i++){
+      phi=Subspace.subspace[i];
+
+      std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
+
+      for(int p=0;p<geom.npoint;p++){ 
+
+        int dir   = geom.directions[p];
+        int disp  = geom.displacements[p];
+
+        // ratio of fine to coarse extent along dir
+        Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+
+        LatticeCoordinate(coor,dir);
+
+        if ( disp==0 ){
+          linop.OpDiag(phi,Mphi);
+        }
+        else  {
+          linop.OpDir(phi,Mphi,dir,disp); 
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Pick out contributions coming from this cell and neighbour cell
+        ////////////////////////////////////////////////////////////////////////
+        if ( disp==0 ) {
+          iblock = Mphi;
+          oblock = Zero();
+        } else if ( disp==1 ) {
+          oblock = where(mod(coor,block)==(block-1),Mphi,zz);
+          iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
+        } else if ( disp==-1 ) {
+          oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
+          iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
+        } else {
+          assert(0);
+        }
+
+        Subspace.ProjectToSubspace(iProj,iblock);
+        Subspace.ProjectToSubspace(oProj,oblock);
+        //	  blockProject(iProj,iblock,Subspace.subspace);
+        //	  blockProject(oProj,oblock,Subspace.subspace);
+        auto iProj_v = iProj.View() ;
+        auto oProj_v = oProj.View() ;
+        auto A_p     =  A[p].View();
+        auto A_self  = A[self_stencil].View();
+        thread_loop( (int ss=0;ss<Grid()->oSites();ss++),{
+          for(int j=0;j<nbasis;j++){
+            if( disp!= 0 ) {
+              A_p[ss](j,i) = oProj_v[ss](j);
+            }
+            A_self[ss](j,i) =	A_self[ss](j,i) + iProj_v[ss](j);
+          }
+        });
+      }
+    }
+
+#if 0
+    ///////////////////////////
+    // test code worth preserving in if block
+    ///////////////////////////
+    std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
+    for(int p=0;p<geom.npoint;p++){
+      std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
+      std::cout<<GridLogMessage<< A[p] << std::endl;
+    }
+    std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
+
+    phi=Subspace.subspace[0];
+    std::vector<int> bc(FineGrid->_ndimension,0);
+
+    blockPick(Grid(),phi,tmp,bc);      // Pick out a block
+    linop.Op(tmp,Mphi);                // Apply big dop
+    blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
+    std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
+    std::cout<<GridLogMessage<< iProj <<std::endl;
+    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
+#endif
+    //      ForceHermitian();
+    AssertHermitian();
+    // ForceDiagonal();
+  }
+
+  // Debug helper: zeroes A[0..7] and sets A[8] to (val*adj(val)+1) times the
+  // identity, with val drawn from a fixed-seed RNG.  The literal indices
+  // assume a nine-point stencil (geom.npoint==9).
+  void ForceDiagonal(void) {
+
+
+    std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
+    std::cout<<GridLogMessage<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
+    std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
+    for(int p=0;p<8;p++){
+      A[p]=Zero();
+    }
+
+    GridParallelRNG  RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
+    Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
+
+    Complex one(1.0);
+
+    iMatrix<CComplex,nbasis> ident;  ident=one;
+
+    val = val*adj(val);
+    val = val + 1.0;
+
+    A[8] = val*ident;
+
+    //      for(int s=0;s<Grid()->oSites();s++) {
+    //	A[8][s]=val[s];
+    //      }
+  }
+
+  // Overwrites each A[2*d] with adj(Cshift(A[2*d+1],d+1,1)) for d=0..3.
+  // NOTE(review): the literal 4 and the d+1 shift dimension match the stencil
+  // Geometry builds for a 5d grid -- confirm before using on other grids.
+  void ForceHermitian(void) {
+    for(int d=0;d<4;d++){
+      int dd=d+1;
+      A[2*d] = adj(Cshift(A[2*d+1],dd,1));
+    }
+    //      A[8] = 0.5*(A[8] + adj(A[8]));
+  }
+
+  // Despite the name this only reports: for d=0..3 it prints
+  // norm2(A[2*d] - adj(Cshift(A[2*d+1],d+1,1))) and norm2(A[2*d]), then the
+  // same pair for A[8] against its own adjoint.  Nothing is asserted.
+  void AssertHermitian(void) {
+    CoarseMatrix AA    (Grid());
+    CoarseMatrix AAc   (Grid());
+    CoarseMatrix Diff  (Grid());
+    for(int d=0;d<4;d++){
+
+      int dd=d+1;
+      AAc = Cshift(A[2*d+1],dd,1);
+      AA  = A[2*d];
+
+      Diff = AA - adj(AAc);
+
+      std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
+      std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
+
+    }
+    Diff = A[8] - adj(A[8]);
+    std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
+    std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
+  }
+    
+};
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@@ -0,0 +1,291 @@
+
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/FFT.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef _GRID_FFT_H_
+#define _GRID_FFT_H_
+
+#ifdef HAVE_FFTW
+#ifdef USE_MKL
+#include <fftw/fftw3.h>
+#else
+#include <fftw3.h>
+#endif
+#endif
+
+
+NAMESPACE_BEGIN(Grid);
+
+// Primary template is empty; the wrappers live in the ComplexD and ComplexF
+// specialisations below, which are only compiled when HAVE_FFTW is defined.
+template<class scalar> struct FFTW { };
+
+#ifdef HAVE_FFTW	
+// ComplexD binding: every member forwards to the matching global ::fftw_* routine.
+template<> struct FFTW<ComplexD> {
+public:
+
+  using FFTW_scalar = fftw_complex;
+  using FFTW_plan   = fftw_plan;
+
+  // Forwards all arguments unchanged to ::fftw_plan_many_dft and returns its plan.
+  static FFTW_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
+                                      FFTW_scalar *in,  const int *inembed, int istride, int idist,
+                                      FFTW_scalar *out, const int *onembed, int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan plan = ::fftw_plan_many_dft(rank,n,howmany,
+                                          in ,inembed,istride,idist,
+                                          out,onembed,ostride,odist,
+                                          sign,flags);
+    return plan;
+  }
+
+  // Forwards to ::fftw_flops, which fills *add, *mul and *fmas for the plan.
+  static void fftw_flops(const FFTW_plan plan, double *add, double *mul, double *fmas) {
+    ::fftw_flops(plan,add,mul,fmas);
+  }
+
+  // Forwards to ::fftw_execute_dft on the given in/out arrays.
+  static void fftw_execute_dft(const FFTW_plan plan, FFTW_scalar *in, FFTW_scalar *out) {
+    ::fftw_execute_dft(plan,in,out);
+  }
+
+  // Forwards to ::fftw_destroy_plan.
+  static void fftw_destroy_plan(const FFTW_plan plan) {
+    ::fftw_destroy_plan(plan);
+  }
+};
+
+// ComplexF binding: every member forwards to the matching global ::fftwf_* routine.
+template<> struct FFTW<ComplexF> {
+public:
+
+  using FFTW_scalar = fftwf_complex;
+  using FFTW_plan   = fftwf_plan;
+
+  // Forwards all arguments unchanged to ::fftwf_plan_many_dft and returns its plan.
+  static FFTW_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
+                                      FFTW_scalar *in,  const int *inembed, int istride, int idist,
+                                      FFTW_scalar *out, const int *onembed, int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan plan = ::fftwf_plan_many_dft(rank,n,howmany,
+                                           in ,inembed,istride,idist,
+                                           out,onembed,ostride,odist,
+                                           sign,flags);
+    return plan;
+  }
+
+  // Forwards to ::fftwf_flops, which fills *add, *mul and *fmas for the plan.
+  static void fftw_flops(const FFTW_plan plan, double *add, double *mul, double *fmas) {
+    ::fftwf_flops(plan,add,mul,fmas);
+  }
+
+  // Forwards to ::fftwf_execute_dft on the given in/out arrays.
+  static void fftw_execute_dft(const FFTW_plan plan, FFTW_scalar *in, FFTW_scalar *out) {
+    ::fftwf_execute_dft(plan,in,out);
+  }
+
+  // Forwards to ::fftwf_destroy_plan.
+  static void fftw_destroy_plan(const FFTW_plan plan) {
+    ::fftwf_destroy_plan(plan);
+  }
+};
+
+#endif
+
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+#endif
+
+// Performs 1d FFTs along chosen dimensions of a Lattice living on `vgrid`,
+// using the FFTW<> wrappers, and accumulates flop and timing counters.
+//
+// NOTE(review): the class news `sgrid` in its constructor and deletes it in
+// its destructor but declares no copy constructor or copy assignment, so a
+// copied FFT would delete the same pointer twice -- confirm FFT objects are
+// never copied.
+class FFT {
+private:
+    
+  GridCartesian *vgrid;  // grid passed to the constructor (not deleted here)
+  GridCartesian *sgrid;  // grid built in the constructor with layout (1,...,1); deleted in the destructor
+    
+  int Nd;
+  double flops;       // running total, incremented by FFT_dim
+  double flops_call;  // add+mul+2*fma reported for the plan of the latest FFT_dim call
+  uint64_t usec;      // running total of timer.useconds() from FFT_dim
+    
+  Coordinate dimensions;
+  Coordinate processors;
+  Coordinate processor_coor;
+    
+public:
+    
+  static const int forward=FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+    
+  double Flops(void) {return flops;}
+  // Divides by usec, which is 0 until FFT_dim has accumulated some time.
+  double MFlops(void) {return flops/usec;}
+  double USec(void)   {return (double)usec;}    
+
+  // Copies the dimension and processor layout out of `grid`, zeroes the
+  // counters, and allocates sgrid over the same dimensions and processors.
+  FFT ( GridCartesian * grid ) :
+    vgrid(grid),
+    Nd(grid->_ndimension),
+    dimensions(grid->_fdimensions),
+    processors(grid->_processors),
+    processor_coor(grid->_processor_coor)
+  {
+    flops=0;
+    usec =0;
+    Coordinate layout(Nd,1);
+    sgrid = new GridCartesian(dimensions,layout,processors);
+  };
+    
+  ~FFT ( void)  {
+    delete sgrid;
+  }
+    
+  // Transforms `source` along every dimension d with mask[d] non-zero, in
+  // increasing d, feeding each result into the next transform.  `result` is
+  // only written when at least one mask entry is non-zero.
+  template<class vobj>
+  void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
+
+    conformable(result.Grid(),vgrid);
+    conformable(source.Grid(),vgrid);
+    Lattice<vobj> tmp(vgrid);
+    tmp = source;
+    for(int d=0;d<Nd;d++){
+      if( mask[d] ) {
+	FFT_dim(result,tmp,d,sign);
+	tmp=result;
+      }
+    }
+  }
+
+  // FFT_dim_mask with every dimension selected.
+  template<class vobj>
+  void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
+    Coordinate mask(Nd,1);
+    FFT_dim_mask(result,source,mask,sign);
+  }
+
+
+  // 1d transform of `source` along dimension `dim` into `result`.
+  //   sign : FFT::forward or FFT::backward; any other value hits assert(0).
+  //          The backward transform is scaled by 1/G, the forward by 1.
+  // Without HAVE_FFTW the body is just assert(0).
+  template<class vobj>
+  void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
+#ifndef HAVE_FFTW
+    assert(0);
+#else
+    conformable(result.Grid(),vgrid);
+    conformable(source.Grid(),vgrid);
+
+    // presumably L is the per-node extent and G the full extent along dim -- TODO confirm
+    int L = vgrid->_ldimensions[dim];
+    int G = vgrid->_fdimensions[dim];
+      
+    Coordinate layout(Nd,1);
+    Coordinate pencil_gd(vgrid->_fdimensions);
+      
+    pencil_gd[dim] = G*processors[dim];
+      
+    // Pencil global vol LxLxGxLxL per node
+    GridCartesian pencil_g(pencil_gd,layout,processors);
+      
+    // Construct pencils
+    typedef typename vobj::scalar_object sobj;
+    typedef typename sobj::scalar_type   scalar;
+      
+    Lattice<sobj> pgbuf(&pencil_g);
+    auto pgbuf_v = pgbuf.View();
+
+    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
+    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
+      
+    int Ncomp = sizeof(sobj)/sizeof(scalar);
+    int Nlow  = 1;
+    for(int d=0;d<dim;d++){
+      Nlow*=vgrid->_ldimensions[d];
+    }
+      
+    int rank = 1;  /* 1d transforms */
+    int n[] = {G}; /* 1d transforms of length G */
+    int howmany = Ncomp;
+    int odist,idist,istride,ostride;
+    idist   = odist   = 1;          /* Distance between consecutive FT's */
+    istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+    int *inembed = n, *onembed = n;
+      
+    scalar div;
+    if ( sign == backward ) div = 1.0/G;
+    else if ( sign == forward ) div = 1.0;
+    else assert(0);
+      
+    // Plan an in-place transform: in and out both point at the start of pgbuf.
+    FFTW_plan p;
+    {
+      FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
+      FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
+      p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
+					   in,inembed,
+					   istride,idist,
+					   out,onembed,
+					   ostride, odist,
+					   sign,FFTW_ESTIMATE);
+    }
+      
+    // Barrel shift and collect global pencil
+    Coordinate lcoor(Nd), gcoor(Nd);
+    result = source;
+    int pc = processor_coor[dim];
+    // NOTE: the loop index `p` below shadows the FFTW plan `p` inside this loop only.
+    for(int p=0;p<processors[dim];p++) {
+      thread_loop( (int idx=0;idx<sgrid->lSites();idx++), {
+          Coordinate cbuf(Nd);
+          sobj s;
+	  sgrid->LocalIndexToLocalCoor(idx,cbuf);
+	  peekLocalSite(s,result,cbuf);
+	  cbuf[dim]+=((pc+p) % processors[dim])*L;
+	  //            cbuf[dim]+=p*L;
+	  pokeLocalSite(s,pgbuf,cbuf);
+      });
+      // the shift after the final copy would be unused, so it is skipped
+      if (p != processors[dim] - 1) {
+	result = Cshift(result,dim,L);
+      }
+    }
+      
+    // Loop over orthog coords
+    int NN=pencil_g.lSites();
+    GridStopWatch timer;
+    timer.Start();
+    thread_loop( (int idx=0;idx<NN;idx++), {
+        Coordinate cbuf(Nd);
+	pencil_g.LocalIndexToLocalCoor(idx, cbuf);
+	if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
+	  FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
+	  FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
+	  FFTW<scalar>::fftw_execute_dft(p,in,out);
+	}
+    });
+    timer.Stop();
+      
+    // performance counting
+    // NOTE(review): the per-plan count is scaled by NN (all local pencil
+    // sites) although fftw_execute_dft above only runs for sites with
+    // cbuf[dim]==0 -- confirm the intended normalisation.
+    double add,mul,fma;
+    FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
+    flops_call = add+mul+2.0*fma;
+    usec += timer.useconds();
+    flops+= flops_call*NN;
+      
+    // writing out result
+    thread_loop( (int idx=0;idx<sgrid->lSites();idx++), {
+	Coordinate clbuf(Nd), cgbuf(Nd);
+	sobj s;
+	sgrid->LocalIndexToLocalCoor(idx,clbuf);
+	cgbuf = clbuf;
+	cgbuf[dim] = clbuf[dim]+L*pc;
+	peekLocalSite(s,pgbuf,cgbuf);
+	pokeLocalSite(s,result,clbuf);
+    });
+    result = result*div;
+      
+    // destroying plan
+    FFTW<scalar>::fftw_destroy_plan(p);
+#endif
+  }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -0,0 +1,498 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/LinearOperator.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef  GRID_ALGORITHM_LINEAR_OP_H
+#define  GRID_ALGORITHM_LINEAR_OP_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+// LinearOperators Take a something and return a something.
+/////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
+//SBase
+//   i)  F(a x + b y) = aF(x) + b F(y).
+//  ii)  <x|Op|y> = <y|AdjOp|x>^\ast
+//
+// Would be fun to have a test linearity & Herm Conj function!
+/////////////////////////////////////////////////////////////////////////////////////////////
+// Abstract interface implemented by every operator wrapper in this header.
+// Each method reads `in` and writes its result into `out`.
+template<class Field> class LinearOperatorBase {
+public:
+  // Virtual destructor so implementations can be destroyed safely through a
+  // pointer or reference to this base class.
+  virtual ~LinearOperatorBase() = default;
+
+  // Support for coarsening to a multigrid
+  virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
+  virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
+
+  virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
+  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
+  // Hermitian application; n1 and n2 are scalar by-products whose exact
+  // meaning is chosen by each implementation.
+  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
+  virtual void HermOp(const Field &in, Field &out)=0;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+// By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
+// between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
+// the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
+// replication of code.
+//
+// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
+// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
+// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
+// do it, but I fear it required multiple inheritance and mixed in abstract base classes
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////
+// Construct herm op from non-herm matrix
+////////////////////////////////////////////////////////////////////
+// Exposes a matrix through the LinearOperatorBase interface; the Hermitian
+// operation is delegated to the matrix's own MdagM.
+template<class Matrix,class Field>
+class MdagMLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat; // wrapped matrix, held by reference
+public:
+  MdagMLinearOperator(Matrix &Mat)
+    : _Mat(Mat)
+  {}
+
+  // Multigrid coarsening hooks: forwarded straight to the matrix
+  void OpDiag(const Field &in, Field &out) { _Mat.Mdiag(in,out); }
+  void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); }
+
+  // Plain and adjoint application
+  void Op   (const Field &in, Field &out) { _Mat.M(in,out); }
+  void AdjOp(const Field &in, Field &out) { _Mat.Mdag(in,out); }
+
+  // Hermitian application; n1 and n2 are whatever MdagM reports
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) {
+    _Mat.MdagM(in,out,n1,n2);
+  }
+  // As HermOpAndNorm, with the two scalars discarded
+  void HermOp(const Field &in, Field &out) {
+    RealD unused1, unused2;
+    HermOpAndNorm(in,out,unused1,unused2);
+  }
+};
+
+////////////////////////////////////////////////////////////////////
+// Construct herm op and shift it for mgrid smoother
+////////////////////////////////////////////////////////////////////
+// Hermitian operator MdagM plus a constant shift: out = MdagM in + shift * in.
+template<class Matrix,class Field>
+class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;  // wrapped matrix, held by reference
+  RealD _shift;  // coefficient of the term added to MdagM
+public:
+  ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift)
+    : _Mat(Mat), _shift(shift)
+  {}
+
+  // The next four entry points forward to the matrix and then trap with
+  // assert(0): only the Hermitian forms below are meant to be used here.
+  void OpDiag(const Field &in, Field &out) {
+    _Mat.Mdiag(in,out);
+    assert(0);
+  }
+  void OpDir(const Field &in, Field &out,int dir,int disp) {
+    _Mat.Mdir(in,out,dir,disp);
+    assert(0);
+  }
+  void Op(const Field &in, Field &out) {
+    _Mat.M(in,out);
+    assert(0);
+  }
+  void AdjOp(const Field &in, Field &out) {
+    _Mat.Mdag(in,out);
+    assert(0);
+  }
+
+  // Applies MdagM, adds the shift, then recomputes both scalars from the
+  // shifted result: n1 = Re<in|out>, n2 = norm2(out).
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) {
+    _Mat.MdagM(in,out,n1,n2);
+    out = out + _shift*in;
+
+    ComplexD inner = innerProduct(in,out);
+    n1 = real(inner);
+    n2 = norm2(out);
+  }
+  // As HermOpAndNorm, with the two scalars discarded
+  void HermOp(const Field &in, Field &out) {
+    RealD unused1, unused2;
+    HermOpAndNorm(in,out,unused1,unused2);
+  }
+};
+
+////////////////////////////////////////////////////////////////////
+// Wrap an already herm matrix
+////////////////////////////////////////////////////////////////////
+// Wrapper for a matrix that is already Hermitian: Op, AdjOp and HermOp all
+// apply the same M.
+template<class Matrix,class Field>
+class HermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat; // wrapped matrix, held by reference
+public:
+  HermitianLinearOperator(Matrix &Mat)
+    : _Mat(Mat)
+  {}
+
+  // Multigrid coarsening hooks: forwarded straight to the matrix
+  void OpDiag(const Field &in, Field &out) { _Mat.Mdiag(in,out); }
+  void OpDir (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); }
+
+  // The adjoint is taken to be M itself
+  void Op   (const Field &in, Field &out) { _Mat.M(in,out); }
+  void AdjOp(const Field &in, Field &out) { _Mat.M(in,out); }
+
+  // out = M in, with n1 = Re<in|out> and n2 = norm2(out)
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) {
+    _Mat.M(in,out);
+
+    ComplexD inner = innerProduct(in,out);
+    n1 = real(inner);
+    n2 = norm2(out);
+  }
+  void HermOp(const Field &in, Field &out) { _Mat.M(in,out); }
+};
+
+//////////////////////////////////////////////////////////
+// Even Odd Schur decomp operators; there are several
+// ways to introduce the even odd checkerboarding
+//////////////////////////////////////////////////////////
+
+// Common base for the even-odd (Schur) preconditioned operators. Derived
+// classes supply Mpc / MpcDag; this class builds the LinearOperatorBase
+// entry points from them.
+template<class Field>
+class SchurOperatorBase :  public LinearOperatorBase<Field> {
+public:
+  // Preconditioned operator and its adjoint; each returns a RealD
+  virtual RealD Mpc   (const Field &in, Field &out) = 0;
+  virtual RealD MpcDag(const Field &in, Field &out) = 0;
+
+  // out = MpcDag Mpc in; ni and no receive the values returned by the two steps
+  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no)
+  {
+    Field mid(in.Grid());
+    mid.Checkerboard() = in.Checkerboard();
+    ni = Mpc(in,mid);
+    no = MpcDag(mid,out);
+  }
+
+  // The Hermitian operation is MpcDagMpc, with out tagged with in's checkerboard
+  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) {
+    out.Checkerboard() = in.Checkerboard();
+    MpcDagMpc(in,out,n1,n2);
+  }
+  virtual void HermOp(const Field &in, Field &out) {
+    RealD unused1, unused2;
+    HermOpAndNorm(in,out,unused1,unused2);
+  }
+
+  // Op / AdjOp forward to Mpc / MpcDag and drop the returned value
+  void Op   (const Field &in, Field &out) { Mpc(in,out); }
+  void AdjOp(const Field &in, Field &out) { MpcDag(in,out); }
+
+  // Support for coarsening to a multigrid
+  void OpDiag(const Field &in, Field &out) {
+    assert(0); // must coarsen the unpreconditioned system
+  }
+  void OpDir(const Field &in, Field &out,int dir,int disp) {
+    assert(0);
+  }
+};
+// Schur operator built from Mooee and Meooe MooeeInv Meooe.
+// NOTE(review): axpy_norm(out,-1.0,x,y) is taken to form out = y - x and
+// return a norm of out - confirm against its definition.
+template<class Matrix,class Field>
+class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
+protected:
+  Matrix &_Mat; // wrapped checkerboarded matrix, held by reference
+public:
+  SchurDiagMooeeOperator (Matrix &Mat)
+    : _Mat(Mat)
+  {}
+
+  virtual RealD Mpc(const Field &in, Field &out) {
+    Field hop(in.Grid());
+    //	std::cout <<"grid pointers: in.Grid()="<< in.Grid() << " out.Grid()=" << out.Grid() << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
+    hop.Checkerboard() = !in.Checkerboard();
+
+    // hop = Meooe MooeeInv Meooe in (out is used as the intermediate)
+    _Mat.Meooe(in,hop);
+    _Mat.MooeeInv(hop,out);
+    _Mat.Meooe(out,hop);
+
+    //std::cout << "cb in " << in.Checkerboard() << "  cb out " << out.Checkerboard() << std::endl;
+    // out = Mooee in, then combined with hop
+    _Mat.Mooee(in,out);
+    RealD result = axpy_norm(out,-1.0,hop,out);
+    return result;
+  }
+
+  virtual RealD MpcDag(const Field &in, Field &out) {
+    Field hop(in.Grid());
+
+    // Same sequence as Mpc using the Dag variants of every factor
+    _Mat.MeooeDag(in,hop);
+    _Mat.MooeeInvDag(hop,out);
+    _Mat.MeooeDag(out,hop);
+
+    _Mat.MooeeDag(in,out);
+    RealD result = axpy_norm(out,-1.0,hop,out);
+    return result;
+  }
+};
+// Schur operator in which MooeeInv is applied last in Mpc.
+// NOTE(review): axpy_norm(out,-1.0,x,y) is taken to form out = y - x and
+// return a norm of out - confirm against its definition.
+template<class Matrix,class Field>
+class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
+protected:
+  Matrix &_Mat; // wrapped checkerboarded matrix, held by reference
+public:
+  SchurDiagOneOperator (Matrix &Mat)
+    : _Mat(Mat)
+  {}
+
+  virtual RealD Mpc(const Field &in, Field &out) {
+    Field work(in.Grid());
+
+    // work = MooeeInv Meooe MooeeInv Meooe in, ping-ponging between out and work
+    _Mat.Meooe(in,out);
+    _Mat.MooeeInv(out,work);
+    _Mat.Meooe(work,out);
+    _Mat.MooeeInv(out,work);
+
+    RealD result = axpy_norm(out,-1.0,work,in);
+    return result;
+  }
+
+  virtual RealD MpcDag(const Field &in, Field &out) {
+    Field work(in.Grid());
+
+    // Dag variants of the factors, applied in the reverse order of Mpc
+    _Mat.MooeeInvDag(in,out);
+    _Mat.MeooeDag(out,work);
+    _Mat.MooeeInvDag(work,out);
+    _Mat.MeooeDag(out,work);
+
+    RealD result = axpy_norm(out,-1.0,work,in);
+    return result;
+  }
+};
+// Schur operator in which MooeeInv is applied first in Mpc.
+// NOTE(review): axpy_norm(out,-1.0,x,y) is taken to form out = y - x and
+// return a norm of out - confirm against its definition.
+template<class Matrix,class Field>
+class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
+protected:
+  Matrix &_Mat; // wrapped checkerboarded matrix, held by reference
+public:
+  SchurDiagTwoOperator (Matrix &Mat)
+    : _Mat(Mat)
+  {}
+
+  virtual RealD Mpc(const Field &in, Field &out) {
+    Field work(in.Grid());
+
+    // work = Meooe MooeeInv Meooe MooeeInv in, ping-ponging between out and work
+    _Mat.MooeeInv(in,out);
+    _Mat.Meooe(out,work);
+    _Mat.MooeeInv(work,out);
+    _Mat.Meooe(out,work);
+
+    RealD result = axpy_norm(out,-1.0,work,in);
+    return result;
+  }
+
+  virtual RealD MpcDag(const Field &in, Field &out) {
+    Field work(in.Grid());
+
+    // Dag variants of the factors, applied in the reverse order of Mpc
+    _Mat.MeooeDag(in,out);
+    _Mat.MooeeInvDag(out,work);
+    _Mat.MeooeDag(work,out);
+    _Mat.MooeeInvDag(out,work);
+
+    RealD result = axpy_norm(out,-1.0,work,in);
+    return result;
+  }
+};
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
+// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Aliases named after the two forms derived above: the right-handed form is
+// SchurDiagTwoOperator and the left-handed form is SchurDiagOneOperator.
+template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
+template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//  Staggered use
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Schur operator for staggered use. The constructor asserts that the matrix
+// reports a trivial even-even block (isTrivialEE) and caches its Mass();
+// the Hermitian operations then combine mass*mass*in with Meooe Meooe in.
+// Per-phase timings are accumulated and can be printed with Report().
+template<class Matrix,class Field>
+class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
+protected:
+  Matrix &_Mat;        // wrapped checkerboarded matrix, held by reference
+  Field tmp;           // scratch field on the matrix's red-black grid (used by HermOp)
+  RealD mass;          // cached _Mat.Mass()
+  double tMpc;         // accumulated usecond() time around Mpc / the HermOp hops
+  double tIP;          // accumulated time in innerProduct
+  double tMeo;         // accumulated time in the two Meooe calls inside Mpc
+  double taxpby_norm;  // accumulated time in axpby / axpby_norm
+  uint64_t ncall;      // number of HermOp / HermOpAndNorm calls
+public:
+  // Prints the average time per call for each timed phase.
+  void Report(void)
+  {
+    // Before the first call ncall is 0 and the averages would be NaN;
+    // divide by 1 in that case so zeros are reported instead.
+    const double calls = (ncall > 0) ? static_cast<double>(ncall) : 1.0;
+    std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/calls<<" usec "<<std::endl;
+    std::cout << GridLogMessage << " HermOpAndNorm.IP  "<< tIP /calls<<" usec "<<std::endl;
+    std::cout << GridLogMessage << " Mpc.MeoMoe        "<< tMeo/calls<<" usec "<<std::endl;
+    std::cout << GridLogMessage << " Mpc.axpby_norm    "<< taxpby_norm/calls<<" usec "<<std::endl;
+  }
+  SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
+  {
+    assert( _Mat.isTrivialEE() );
+    mass = _Mat.Mass();
+    tMpc=0;
+    tIP =0;
+    tMeo=0;
+    taxpby_norm=0;
+    ncall=0;
+  }
+  // out = Mpc in; n2 is the value returned by Mpc and n1 = Re<in|out>.
+  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    ncall++;
+    tMpc-=usecond();
+    n2 = Mpc(in,out);
+    tMpc+=usecond();
+    tIP-=usecond();
+    ComplexD dot= innerProduct(in,out);
+    tIP+=usecond();
+    n1 = real(dot);
+  }
+  // Same operator as Mpc without the norm, using the member scratch field.
+  virtual void HermOp(const Field &in, Field &out){
+    ncall++;
+    tMpc-=usecond();
+    _Mat.Meooe(in,out);
+    _Mat.Meooe(out,tmp);
+    tMpc+=usecond();
+    taxpby_norm-=usecond();
+    // presumably out = -1.0*tmp + mass*mass*in - confirm against axpby
+    axpby(out,-1.0,mass*mass,tmp,in);
+    taxpby_norm+=usecond();
+  }
+  virtual  RealD Mpc      (const Field &in, Field &out)
+  {
+    // Local scratch on in's grid; named differently from the member `tmp`
+    // so it no longer shadows it. The previously allocated, never used
+    // second scratch field has been dropped.
+    Field scratch(in.Grid());
+
+    //    std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
+    // NOTE(review): both Mooee results are overwritten by the Meooe calls
+    // below; this looks like dead work - confirm before removing.
+    _Mat.Mooee(in,out);
+    _Mat.Mooee(out,scratch);
+    //    std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
+
+    tMeo-=usecond();
+    _Mat.Meooe(in,out);
+    _Mat.Meooe(out,scratch);
+    tMeo+=usecond();
+    taxpby_norm-=usecond();
+    RealD nn=axpby_norm(out,-1.0,mass*mass,scratch,in);
+    taxpby_norm+=usecond();
+    return nn;
+  }
+  // The adjoint is implemented as Mpc itself.
+  virtual  RealD MpcDag   (const Field &in, Field &out){
+    return Mpc(in,out);
+  }
+  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+    assert(0);// Never need with staggered
+  }
+};
+// Shorter alias for SchurStaggeredOperator.
+template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
+
+
+/////////////////////////////////////////////////////////////
+// Base classes for functions of operators
+/////////////////////////////////////////////////////////////
+// Interface for a function of a linear operator applied to a field.
+template<class Field> class OperatorFunction {
+public:
+  // Virtual destructor so implementations can be destroyed through this base.
+  virtual ~OperatorFunction() = default;
+
+  // Apply the function of Linop to `in`, writing the result into `out`.
+  virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
+
+  // Vector form: applies the single-field overload element by element.
+  // `in` and `out` must have the same length.
+  virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
+    assert(in.size()==out.size());
+    // Unsigned index: avoids the signed/unsigned comparison against size()
+    for(std::size_t k=0;k<in.size();k++){
+      (*this)(Linop,in[k],out[k]);
+    }
+  }
+};
+
+// Interface for a map from one field to another: reads `in`, writes `out`.
+template<class Field> class LinearFunction {
+public:
+  // Virtual destructor so implementations can be destroyed through this base.
+  virtual ~LinearFunction() = default;
+  virtual void operator() (const Field &in, Field &out) = 0;
+};
+
+// LinearFunction that copies its input to its output.
+template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
+public:
+  void operator() (const Field &in, Field &out)
+  {
+    out = in;
+  }
+};
+
+
+/////////////////////////////////////////////////////////////
+// Base classes for Multishift solvers for operators
+/////////////////////////////////////////////////////////////
+// Interface for a function of a linear operator that produces several output
+// fields from a single input field.
+template<class Field> class OperatorMultiFunction {
+public:
+  // Virtual destructor so implementations can be destroyed through this base.
+  virtual ~OperatorMultiFunction() = default;
+  virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
+};
+
+// FIXME : To think about
+
+// Chroma functionality list defining LinearOperator
+/*
+  virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
+  virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
+  virtual const Subset& subset() const = 0;
+  virtual unsigned long nFlops() const { return 0; }
+  virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
+  class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
+  const Subset& subset() const {return all;}
+  };
+*/
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Hermitian operator Linear function and operator function
+////////////////////////////////////////////////////////////////////////////////////////////
+// OperatorFunction that simply applies the operator's HermOp.
+template<class Field>
+class HermOpOperatorFunction : public OperatorFunction<Field> {
+public:
+  // Declared public: without an access specifier the call operator was
+  // private and could only be reached through an OperatorFunction reference.
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+    Linop.HermOp(in,out);
+  }
+};
+
+// LinearFunction view of an operator: calling it applies the operator's HermOp.
+template<typename Field>
+class PlainHermOp : public LinearFunction<Field> {
+public:
+  LinearOperatorBase<Field> &_Linop; // operator being wrapped, held by reference
+
+  PlainHermOp(LinearOperatorBase<Field>& linop)
+    : _Linop(linop)
+  {}
+
+  void operator()(const Field& in, Field& out)
+  {
+    _Linop.HermOp(in,out);
+  }
+};
+
+// LinearFunction that binds an OperatorFunction to a particular operator:
+// calling it evaluates _poly(_Linop, in, out).
+template<typename Field>
+class FunctionHermOp : public LinearFunction<Field> {
+public:
+  OperatorFunction<Field>   & _poly;   // function of the operator, held by reference
+  LinearOperatorBase<Field> &_Linop;   // operator it is applied to, held by reference
+
+  FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
+    : _poly(poly),
+      _Linop(linop)
+  {}
+
+  void operator()(const Field& in, Field& out)
+  {
+    _poly(_Linop,in,out);
+  }
+};
+
+// Polynomial in the operator's HermOp:
+//   out = sum_n Coeffs[n] * HermOp^n in
+template<class Field>
+class Polynomial : public OperatorFunction<Field> {
+private:
+  std::vector<RealD> Coeffs; // Coeffs[n] multiplies the n-th power of HermOp
+public:
+  // Takes the coefficients by const reference (they are copied), so const
+  // vectors and temporaries are accepted as well as non-const lvalues.
+  Polynomial(const std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { }
+
+  // Implement the required interface
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+    // Coeffs[0] is read unconditionally below; trap an empty coefficient list
+    assert(!Coeffs.empty());
+
+    Field AtoN(in.Grid()); // running power HermOp^n in
+    Field Mtmp(in.Grid()); // copy of the previous power, used as HermOp input
+    AtoN = in;
+    out = AtoN*Coeffs[0];
+    for(std::size_t n=1;n<Coeffs.size();n++){
+      Mtmp = AtoN;
+      Linop.HermOp(Mtmp,AtoN);
+      out=out+AtoN*Coeffs[n];
+    }
+  }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/algorithms/Preconditioner.h
+++ b/Grid/algorithms/Preconditioner.h
@@ -0,0 +1,46 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/Preconditioner.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_PRECONDITIONER_H
+#define GRID_PRECONDITIONER_H
+
+NAMESPACE_BEGIN(Grid);
+
+// Abstract preconditioner: a LinearFunction taking a source and producing psi.
+template<class Field> class Preconditioner :  public LinearFunction<Field> {
+public:
+  // Declared public: without an access specifier the call operator was
+  // private and could not be invoked through a Preconditioner reference.
+  virtual ~Preconditioner() = default;
+  virtual void operator()(const Field &src, Field & psi)=0;
+};
+
+// Identity preconditioner: psi is set to a copy of the source.
+template<class Field> class TrivialPrecon :  public Preconditioner<Field> {
+public:
+  TrivialPrecon(void) {}
+
+  void operator()(const Field &src, Field & psi)
+  {
+    psi = src;
+  }
+};
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -0,0 +1,79 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/SparseMatrix.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
+#define  GRID_ALGORITHM_SPARSE_MATRIX_H
+
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+// Interface defining what I expect of a general sparse matrix, such as a Fermion action
+/////////////////////////////////////////////////////////////////////////////////////////////
+template<class Field> class SparseMatrixBase {
+public:
+  // Virtual destructor so implementations can be destroyed through this base.
+  virtual ~SparseMatrixBase() = default;
+
+  virtual GridBase *Grid(void) =0;
+  // Full checkerboard operations; M and Mdag each return a RealD
+  virtual RealD M    (const Field &in, Field &out)=0;
+  virtual RealD Mdag (const Field &in, Field &out)=0;
+  // Default MdagM: out = Mdag (M in), via a temporary on in's grid.
+  // ni receives the value returned by M and no the value returned by Mdag.
+  virtual void  MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
+    Field tmp (in.Grid());
+    ni=M(in,tmp);
+    no=Mdag(tmp,out);
+  }
+  virtual  void Mdiag    (const Field &in, Field &out)=0;
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+// Interface augmented by a red black sparse matrix, such as a Fermion action
+/////////////////////////////////////////////////////////////////////////////////////////////
+template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
+public:
+  virtual GridBase *RedBlackGrid(void)=0;
+
+  //////////////////////////////////////////////////////////////////////
+  // Even-even properties that algorithms may query. The defaults make
+  // no assumptions; a derived class that knows better overrides them.
+  //////////////////////////////////////////////////////////////////////
+  virtual RealD Mass(void)        { return 0.0; }
+  virtual int   ConstEE(void)     { return 0; }
+  virtual int   isTrivialEE(void) { return 0; }
+
+  // Half checkerboard operations
+  virtual void Meooe   (const Field &in, Field &out) = 0;
+  virtual void Mooee   (const Field &in, Field &out) = 0;
+  virtual void MooeeInv(const Field &in, Field &out) = 0;
+
+  // Dag counterparts of the three operations above
+  virtual void MeooeDag   (const Field &in, Field &out) = 0;
+  virtual void MooeeDag   (const Field &in, Field &out) = 0;
+  virtual void MooeeInvDag(const Field &in, Field &out) = 0;
+
+};
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -0,0 +1,377 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/Chebyshev.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <clehner@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CHEBYSHEV_H
+#define GRID_CHEBYSHEV_H
+
+#include <Grid/algorithms/LinearOperator.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Serializable parameter set for constructing a Chebyshev object: the
+// Chebyshev(ChebyParams) constructor passes (alpha, beta, Npoly) to Init
+// as (lo, hi, order).
+struct ChebyParams : Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
+				  RealD, alpha,  
+				  RealD, beta,   
+				  int, Npoly);
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Generic Chebyshev approximations
+////////////////////////////////////////////////////////////////////////////////////////////
+// Chebyshev polynomial of an operator on the interval [lo,hi].
+// Holds `order` series coefficients and can evaluate the series on a scalar
+// (approx / approxD / approxInv) or apply it to a field via operator().
+template<class Field>
+class Chebyshev : public OperatorFunction<Field> {
+private:
+  std::vector<RealD> Coeffs; // series coefficients c[0..order-1]
+  int order;                 // number of coefficients
+  RealD hi;                  // upper end of the interval
+  RealD lo;                  // lower end of the interval
+
+public:
+  // Writes "x approx(x)" lines for x in [lo,hi); the step starts at
+  // 1e-9*(hi-lo) and grows by a factor 1.1 at every sample.
+  void csv(std::ostream &out){
+    RealD diff = hi-lo;
+    RealD delta = diff*1.0e-9;
+    for (RealD x=lo; x<hi; x+=delta) {
+      delta*=1.1;
+      RealD f = approx(x);
+      out<< x<<" "<<f<<std::endl;
+    }
+    return;
+  }
+
+  // Convenience for plotting the approximation
+  void   PlotApprox(std::ostream &out) {
+    out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
+    for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
+      out <<x<<"\t"<<approx(x)<<std::endl;
+    }
+  };
+
+  // The default constructor leaves the object unconfigured; call Init before use.
+  Chebyshev(){};
+  Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
+  Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
+  Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // CJ: the one we need for Lanczos
+  // Sets every coefficient to zero except the last, which is set to one.
+  // Exits the program if _order < 2.
+  void Init(RealD _lo,RealD _hi,int _order)
+  {
+    lo=_lo;
+    hi=_hi;
+    order=_order;
+      
+    if(order < 2) exit(-1);
+    // assign(count, value): the arguments were previously swapped, which
+    // emptied the vector and made the write below out of bounds.
+    Coeffs.assign(order,0.);
+    Coeffs[order-1] = 1.;
+  };
+
+  // Computes the coefficients of func on [_lo,_hi] by sampling it at the
+  // `order` points x_k = 0.5*(cos(pi*(k+0.5)/order)*(hi-lo) + (hi+lo)).
+  // Exits the program if _order < 2.
+  void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
+  {
+    lo=_lo;
+    hi=_hi;
+    order=_order;
+      
+    if(order < 2) exit(-1);
+    Coeffs.resize(order);
+    for(int j=0;j<order;j++){
+      RealD s=0;
+      for(int k=0;k<order;k++){
+	RealD y=std::cos(M_PI*(k+0.5)/order);
+	RealD x=0.5*(y*(hi-lo)+(hi+lo));
+	RealD f=func(x);
+	s=s+f*std::cos( j*M_PI*(k+0.5)/order );
+      }
+      Coeffs[j] = s * 2.0/order;
+    }
+  };
+
+    
+  // Multiplies Coeffs[m], m>=1, by damping factors g[m] built from
+  // U[n] = sin((n+1)*theta)/sin(theta) with theta = acos(cos(pi/(order+2))).
+  void JacksonSmooth(void){
+    const int M=order;
+    RealD alpha = M_PI/(M+2);
+    RealD lmax = std::cos(alpha);
+    RealD sumUsq =0;
+    // The loops below index 0..M inclusive, so M+1 entries are required
+    // (the arrays were previously one element too short).
+    std::vector<RealD> U(M+1);
+    std::vector<RealD> a(M+1);
+    std::vector<RealD> g(M+1);
+    for(int n=0;n<=M;n++){
+      U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
+      sumUsq += U[n]*U[n];
+    }      
+    sumUsq = std::sqrt(sumUsq);
+
+    // NOTE(review): a[0] is left at zero because this loop starts at 1;
+    // kept as in the original - confirm whether a[0]=U[0]/sumUsq was intended.
+    for(int i=1;i<=M;i++){
+      a[i] = U[i]/sumUsq;
+    }
+    g[0] = 1.0;
+    for(int m=1;m<=M;m++){
+      g[m] = 0;
+      for(int i=0;i<=M-m;i++){
+	g[m]+= a[i]*a[m+i];
+      }
+    }
+    // Coeffs holds `order` entries, so stop before index M == order
+    for(int m=1;m<M;m++){
+      Coeffs[m]*=g[m];
+    }
+  }
+  // Evaluates 0.5*c[0] + sum_{i>=1} c[i] T_i(y) with y the image of x under
+  // the linear map taking [lo,hi] to [-1,1].
+  RealD approx(RealD x) // Convenience for plotting the approximation
+  {
+    RealD Tn;
+    RealD Tnm;
+    RealD Tnp;
+      
+    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+      
+    RealD T0=1;
+    RealD T1=y;
+      
+    RealD sum;
+    sum = 0.5*Coeffs[0]*T0;
+    sum+= Coeffs[1]*T1;
+      
+    Tn =T1;
+    Tnm=T0;
+    for(int i=2;i<order;i++){
+      Tnp=2*y*Tn-Tnm; // three-term recurrence T_{n+1} = 2 y T_n - T_{n-1}
+      Tnm=Tn;
+      Tn =Tnp;
+      sum+= Tn*Coeffs[i];
+    }
+    return sum;
+  };
+
+  // Derivative of approx with respect to x, using
+  // sum_i c[i] * i * U_{i-1}(y) scaled by dy/dx = 1/(0.5*(hi-lo)).
+  RealD approxD(RealD x)
+  {
+    RealD Un;
+    RealD Unm;
+    RealD Unp;
+      
+    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+      
+    RealD U0=1;
+    RealD U1=2*y;
+      
+    RealD sum;
+    sum = Coeffs[1]*U0;
+    // With order == 2 there is no Coeffs[2]; only add the term when it exists
+    if ( order > 2 ) {
+      sum+= Coeffs[2]*U1*2.0;
+    }
+      
+    Un =U1;
+    Unm=U0;
+    for(int i=2;i<order-1;i++){
+      Unp=2*y*Un-Unm;
+      Unm=Un;
+      Un =Unp;
+      sum+= Un*Coeffs[i+1]*(i+1.0);
+    }
+    return sum/(0.5*(hi-lo));
+  };
+    
+  // Newton iteration for x with approx(x) == z, starting from x0.
+  // Returns x once |(approx(x)-z)/z| < resid, or a quiet NaN when that is
+  // not reached within maxiter iterations.
+  RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
+    RealD x = x0;
+    RealD eps;
+      
+    int i;
+    for (i=0;i<maxiter;i++) {
+      eps = approx(x) - z;
+      if (fabs(eps / z) < resid)
+	return x;
+      x = x - eps / approxD(x);
+    }
+      
+    return std::numeric_limits<double>::quiet_NaN();
+  }
+    
+  // Implement the required interface
+  // Applies the series to `in` using Linop.HermOp, with the operator mapped
+  // onto [-1,1] through xscale and mscale.
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+    GridBase *grid=in.Grid();
+
+    // std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
+    //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
+
+    Field T0(grid); T0 = in;  
+    Field T1(grid); 
+    Field T2(grid);
+    Field y(grid);
+      
+    Field *Tnm = &T0;
+    Field *Tn  = &T1;
+    Field *Tnp = &T2;
+
+    // Tn=T1 = (xscale M + mscale)in
+    RealD xscale = 2.0/(hi-lo);
+    RealD mscale = -(hi+lo)/(hi-lo);
+    Linop.HermOp(T0,y);
+    T1=y*xscale+in*mscale;
+
+    // sum = .5 c[0] T0 + c[1] T1
+    out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
+    for(int n=2;n<order;n++){
+	
+      Linop.HermOp(*Tn,y);
+
+      y=xscale*y+mscale*(*Tn);
+
+      *Tnp=2.0*y-(*Tnm);
+
+      out=out+Coeffs[n]* (*Tnp);
+
+      // Cycle pointers to avoid copies
+      Field *swizzle = Tnm;
+      Tnm    =Tn;
+      Tn     =Tnp;
+      Tnp    =swizzle;
+	  
+    }
+  }
+};
+
+
+// Chebyshev polynomial whose argument is built from (A - mu)^2 and the two
+// scales alpha and beta (see AminusMuSq). Only the highest coefficient is
+// non-zero.
+template<class Field>
+class ChebyshevLanczos : public Chebyshev<Field> {
+private:
+  std::vector<RealD> Coeffs; // series coefficients c[0..order-1]
+  int order;                 // number of coefficients
+  RealD alpha;
+  RealD beta;
+  RealD mu;
+
+public:
+  // _order must be at least 2: operator() and approx read Coeffs[1].
+  ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
+    alpha(_alpha),
+    beta(_beta),
+    mu(_mu)
+  {
+    order=_order;
+    assert(order >= 2);
+    // all coefficients zero except the last
+    Coeffs.assign(order,0.0);
+    Coeffs[order-1]=1.0;
+  };
+
+  // Writes "x approx(x)" lines for x from -1.2*alpha up to 1.2*alpha in
+  // steps of 2*alpha/10000.
+  void csv(std::ostream &out){
+    for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
+      RealD f = approx(x);
+      out<< x<<" "<<f<<std::endl;
+    }
+    return;
+  }
+
+  // Scalar evaluation of the series at
+  //   y = ( 2 (xx-mu)^2 - (alpha^2+beta^2) ) / (alpha^2-beta^2)
+  RealD approx(RealD xx) // Convenience for plotting the approximation
+  {
+    RealD Tn;
+    RealD Tnm;
+    RealD Tnp;
+    // RealD, matching AminusMuSq (these were declared as Real)
+    RealD aa = alpha * alpha;
+    RealD bb = beta  *  beta;
+      
+    RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
+
+    RealD y= x;
+      
+    RealD T0=1;
+    RealD T1=y;
+      
+    RealD sum;
+    sum = 0.5*Coeffs[0]*T0;
+    sum+= Coeffs[1]*T1;
+      
+    Tn =T1;
+    Tnm=T0;
+    for(int i=2;i<order;i++){
+      Tnp=2*y*Tn-Tnm;
+      Tnm=Tn;
+      Tn =Tnp;
+      sum+= Tn*Coeffs[i];
+    }
+    return sum;
+  };
+
+  // shift_Multiply in Rudy's code
+  // out = (2/(aa-bb)) (A-mu)(A-mu) in - ((aa+bb)/(aa-bb)) in, with A = Linop.HermOp.
+  // `in` and `out` must be distinct fields: `in` is read again after `out`
+  // has been written.
+  void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out) 
+  {
+    GridBase *grid=in.Grid();
+    Field tmp(grid);
+
+    RealD aa= alpha*alpha;
+    RealD bb= beta * beta;
+
+    Linop.HermOp(in,out);
+    out = out - mu*in;
+
+    Linop.HermOp(out,tmp);
+    tmp = tmp - mu * out;
+
+    out = (2.0/ (aa-bb) ) * tmp -  ((aa+bb)/(aa-bb))*in;
+  };
+  // Implement the required interface
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+    GridBase *grid=in.Grid();
+
+    Field T0(grid); T0 = in;  
+    Field T1(grid); 
+    Field T2(grid);
+    Field  y(grid);
+      
+    Field *Tnm = &T0;
+    Field *Tn  = &T1;
+    Field *Tnp = &T2;
+
+    // Tn=T1 = (xscale M )*in
+    AminusMuSq(Linop,T0,T1);
+
+    // sum = .5 c[0] T0 + c[1] T1
+    out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
+    for(int n=2;n<order;n++){
+	
+      AminusMuSq(Linop,*Tn,y);
+
+      *Tnp=2.0*y-(*Tnm);
+
+      out=out+Coeffs[n]* (*Tnp);
+
+      // Cycle pointers to avoid copies
+      Field *swizzle = Tnm;
+      Tnm    =Tn;
+      Tn     =Tnp;
+      Tnp    =swizzle;
+	  
+    }
+  }
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/approx/Forecast.h
+++ b/Grid/algorithms/approx/Forecast.h
@@ -0,0 +1,152 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/approx/Forecast.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: David Murphy <dmurphy@phys.columbia.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#ifndef INCLUDED_FORECAST_H
+#define INCLUDED_FORECAST_H
+
+NAMESPACE_BEGIN(Grid);
+
+// Abstract base class.
+// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
+// and returns a forecasted solution to the system D*psi = phi (psi).
+template<class Matrix, class Field>
+class Forecast
+{
+public:
+  // Virtual so that an implementation destroyed through a Forecast pointer or
+  // reference runs its own destructor.
+  virtual ~Forecast() = default;
+
+  // Mat : operator defining the system
+  // phi : right-hand side
+  // chi : fields the forecast is built from
+  virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
+};
+
+// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
+// used to forecast solutions across poles of the EOFA heatbath.
+//
+// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
+//
+// operator() orthonormalizes the previous solutions into a basis v_i, builds the
+// small dense system
+//     G a = b,   G[i][j] = <v_i, Mdag M v_j>,   b[i] = <v_i, phi>,
+// solves it by Gaussian elimination and returns chi = sum_i a_i v_i.
+// With no previous solutions it returns zero; with exactly one it returns that
+// solution unchanged.
+// NOTE(review): a previous solution that is linearly dependent on the earlier ones
+// has zero norm after projection and is then divided by zero; callers presumably
+// never pass such a set - confirm.
+template<class Matrix, class Field>
+class ChronoForecast : public Forecast<Matrix,Field>
+{
+public:
+  Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns) override
+  {
+    int degree = prev_solns.size();
+    Field chi(phi); // forecasted solution
+
+    // Trivial cases
+    if(degree == 0){ chi = Zero(); return chi; }
+    else if(degree == 1){ return prev_solns[0]; }
+
+    ComplexD xp;
+    Field r(phi); // residual
+    Field Mv(phi);
+    std::vector<Field> v(prev_solns); // orthonormalized previous solutions
+    std::vector<Field> MdagMv(degree,phi);
+
+    // Array to hold the matrix elements
+    std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
+
+    // Solution and source vectors
+    std::vector<ComplexD> a(degree);
+    std::vector<ComplexD> b(degree);
+
+    // Orthonormalize the vector basis: normalize v[i], then remove its component
+    // from every later vector
+    for(int i=0; i<degree; i++){
+      v[i] *= 1.0/std::sqrt(norm2(v[i]));
+      for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
+    }
+
+    // Perform sparse matrix multiplication and construct rhs
+    for(int i=0; i<degree; i++){
+      b[i] = innerProduct(v[i],phi);
+      Mat.M(v[i],Mv);
+      Mat.Mdag(Mv,MdagMv[i]);
+      G[i][i] = innerProduct(v[i],MdagMv[i]);
+    }
+
+    // Construct the matrix; the lower triangle is filled by Hermitian conjugation
+    for(int j=0; j<degree; j++){
+      for(int k=j+1; k<degree; k++){
+        G[j][k] = innerProduct(v[j],MdagMv[k]);
+        G[k][j] = conjugate(G[j][k]);
+      }}
+
+    // Gaussian elimination with partial pivoting
+    for(int i=0; i<degree; i++){
+
+      // Perform partial pivoting: among rows i..degree-1 take the one with the
+      // largest entry in column i. Only rows are exchanged below, so the entry that
+      // lands on the diagonal is G[k][i]; that is the element that must be compared
+      // (comparing G[j][j] could select a row whose column-i entry is tiny or zero).
+      int k = i;
+      for(int j=i+1; j<degree; j++){ if(abs(G[j][i]) > abs(G[k][i])){ k = j; } }
+      if(k != i){
+        xp = b[k];
+        b[k] = b[i];
+        b[i] = xp;
+        for(int j=0; j<degree; j++){
+          xp = G[k][j];
+          G[k][j] = G[i][j];
+          G[i][j] = xp;
+        }
+      }
+
+      // Convert matrix to upper triangular form
+      for(int j=i+1; j<degree; j++){
+        xp = G[j][i]/G[i][i];
+        b[j] -= xp * b[i];
+        for(int c=0; c<degree; c++){ G[j][c] -= xp*G[i][c]; }
+      }
+    }
+
+    // Back-substitute to solve the triangular system and accumulate the initial
+    // guess. Row exchanges do not reorder the unknowns, so a[i] still multiplies v[i].
+    chi = Zero();
+    r = phi;
+    for(int i=degree-1; i>=0; i--){
+      a[i] = 0.0;
+      for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
+      a[i] = (b[i]-a[i])/G[i][i];
+      chi += a[i]*v[i];
+      r -= a[i]*MdagMv[i];
+    }
+
+    // Report how much of the source the forecast already accounts for
+    RealD error = std::sqrt(norm2(r)/norm2(phi));
+    std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
+
+    return chi;
+  };
+};
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/algorithms/approx/LICENSE
+++ b/Grid/algorithms/approx/LICENSE
@@ -0,0 +1,21 @@
+
+Copyright (c) 2011 Michael Clark
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
--- a/Grid/algorithms/approx/MultiShiftFunction.cc
+++ b/Grid/algorithms/approx/MultiShiftFunction.cc
@@ -0,0 +1,57 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/MultiShiftFunction.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/GridCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// Evaluate the partial-fraction sum  norm + sum_k residues[k] / (x + poles[k]).
+double MultiShiftFunction::approx(double x)
+{
+  typedef std::vector<RealD>::size_type index_t;
+
+  double total = norm;
+  const index_t npoles = poles.size();
+  for(index_t k=0; k<npoles; ++k){
+    total += residues[k]/(x+poles[k]);
+  }
+  return total;
+}
+// Write a single-line gnuplot definition of the partial-fraction sum:
+//   f(x) = norm+(residues[0]/(x+poles[0]))+(residues[1]/(x+poles[1]))...;
+void MultiShiftFunction::gnuplot(std::ostream &out)
+{
+  typedef std::vector<RealD>::size_type index_t;
+
+  out<<"f(x) = ";
+  out<<norm<<"";
+  for(index_t k=0; k<poles.size(); ++k){
+    out<<"+("<<residues[k];
+    out<<"/(x+"<<poles[k]<<"))";
+  }
+  out<<";"<<std::endl;
+}
+// Write one CSV row  x,sqrt(x),approx(x),sqrt(x)-approx(x)  per sample point, with x
+// stepping geometrically (factor 1.05) from lo up to, but not including, hi.
+// Nothing is written when lo is not strictly positive.
+void MultiShiftFunction::csv(std::ostream &out)
+{
+  // x*=1.05 only moves towards hi when x is positive: from lo==0 it never moves and
+  // from lo<0 it moves away, so the loop below would never terminate.
+  if ( !(lo > 0.0) ) return;
+
+  for (double x=lo; x<hi; x*=1.05) {
+    double f = approx(x);
+    double r = sqrt(x);
+    out<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
+  }
+  return;
+}
+NAMESPACE_END(Grid);
--- a/Grid/algorithms/approx/MultiShiftFunction.h
+++ b/Grid/algorithms/approx/MultiShiftFunction.h
@@ -0,0 +1,67 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/MultiShiftFunction.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef MULTI_SHIFT_FUNCTION
+#define MULTI_SHIFT_FUNCTION
+
+NAMESPACE_BEGIN(Grid);
+
+// A function in partial-fraction ("multi-shift") form,
+//   f(x) = norm + sum_k residues[k] / (x + poles[k]),
+// together with the interval [lo,hi] it refers to and one tolerance per pole.
+// The coefficients are either filled in by the caller or taken from an AlgRemez
+// object via Init().
+class MultiShiftFunction {
+public:
+  int order;                      // number of poles
+  std::vector<RealD> poles;
+  std::vector<RealD> residues;
+  std::vector<RealD> tolerances;
+  RealD norm;                     // constant term of the sum
+  RealD lo,hi;
+
+  // n zero-valued poles and residues on [_lo,_hi]. order is n and norm starts at 0 so
+  // that no member is left indeterminate; tolerances stays empty for the caller to fill.
+  MultiShiftFunction(int n,RealD _lo,RealD _hi): order(n), poles(n), residues(n), norm(0.0), lo(_lo), hi(_hi) {;};
+  RealD approx(RealD x);
+  void csv(std::ostream &out);
+  void gnuplot(std::ostream &out);
+
+  // Take degree, bounds and partial-fraction coefficients from remez.
+  // inverse selects getIPFE instead of getPFE; tol is the value given to any
+  // tolerance entry created by the resize.
+  void Init(AlgRemez & remez,double tol,bool inverse) 
+  {
+    order=remez.getDegree();
+    tolerances.resize(order,tol);
+    poles.resize(order);
+    residues.resize(order);
+    remez.getBounds(lo,hi);
+    // data() instead of &v[0]: indexing element 0 of an empty vector is undefined.
+    if ( inverse ) remez.getIPFE (residues.data(),poles.data(),&norm);
+    else           remez.getPFE (residues.data(),poles.data(),&norm);
+  }
+  // Allow deferred initialisation
+  MultiShiftFunction(void): order(0), norm(0.0), lo(0.0), hi(0.0) {};
+  // Scalars start from defined values: getPFE/getIPFE leave *Norm untouched when they
+  // bail out early.
+  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse): order(0), norm(0.0), lo(0.0), hi(0.0)
+  {
+    Init(remez,tol,inverse);
+  }
+
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/approx/README
+++ b/Grid/algorithms/approx/README
@@ -0,0 +1,80 @@
+-----------------------------------------------------------------------------------
+
+PAB. Took Mike Clark's AlgRemez from GitHub and included it here (modified a little).
+It is open source; the license, readme and comments are preserved, consistent
+with the license. Mike, thank you!
+-----------------------------------------------------------------------------------
+-----------------------------------------------------------------------------------
+AlgRemez
+
+The archive downloadable here contains an implementation of the Remez
+algorithm which calculates optimal rational (and polynomial)
+approximations to the nth root over a given spectral range.  The Remez
+algorithm, although in principle extremely straightforward to
+program, is quite difficult to get completely correct, e.g., the Maple
+implementation of the algorithm does not always converge to the
+correct answer.
+
+To use this algorithm you need to install GMP, the GNU Multiple
+Precision Library, and when configuring the install, you must include
+the --enable-mpfr option (see the GMP manual for more details).  You
+also have to edit the Makefile for AlgRemez appropriately for your
+system, namely to point correctly to the location of the GMP library.
+
+The simple main program included with this archive invokes the
+AlgRemez class to calculate an approximation given by command line
+arguments.  It is invoked by the following
+
+./test y z n d lambda_low lambda_high precision,
+
+where the function to be approximated is f(x) = x^(y/z), with degree
+(n,d) over the spectral range [lambda_low, lambda_high], using
+precision digits of precision in the arithmetic.  So an example would
+be
+
+./test 1 2 5 5 0.0004 64 40
+
+which corresponds to constructing a rational approximation to the
+square root function, with degree (5,5) over the range [0.0004,64]
+with 40 digits of precision used for the arithmetic.  The parameters y
+and z must be positive, the approximation to f(x) = x^(-y/z) is simply
+the inverse of the approximation to f(x) = x^(y/z).  After the
+approximation has been constructed, the roots and poles of the
+rational function are found, and then the partial fraction expansion
+of both the rational function and its inverse are found, the results
+of which are output to a file called "approx.dat".  In addition, the
+error function of the approximation is output to "error.dat", where it
+can be checked that the resultant approximation satisfies Chebychev's
+criterion, namely all error maxima are equal in magnitude, and
+adjacent maxima are opposite in sign.  There are some caveats here
+however, the optimal polynomial approximation has complex roots, and
+the root finding implemented here cannot (yet) handle complex roots.
+In addition, the partial fraction expansion of rational approximations
+is only found for the case n = d, i.e., the degree of numerator
+polynomial equals that of the denominator polynomial.  The convention
+for the partial fraction expansion is that polar shifts are always
+written added to x, not subtracted.
+
+To do list
+
+1.  Include an exponential dampening factor in the function to be
+approximated.  This may sound trivial to implement, but for some
+parameters, the algorithm seems to break down.  Also, the roots in the
+rational approximation sometimes become complex, which currently
+breaks the stupidly simple root finding code.
+
+2. Make the algorithm faster - it's too slow when running on qcdoc.
+
+3. Add complex root finding.
+
+4. Add more options for error minimisation - currently the code
+minimises the relative error, should add options for absolute error,
+and other norms.
+
+There will be a forthcoming publication concerning the results
+generated by this software, but in the meantime, if you use this
+software, please cite it as
+"M.A. Clark and A.D. Kennedy, https://github.com/mikeaclark/AlgRemez, 2005".
+
+If you have any problems using the software, then please email scientist.mike@gmail.com.
+
--- a/Grid/algorithms/approx/Remez.cc
+++ b/Grid/algorithms/approx/Remez.cc
@@ -0,0 +1,759 @@
+/*
+
+  Mike Clark - 25th May 2005
+
+  alg_remez.C
+
+  AlgRemez is an implementation of the Remez algorithm, which in this
+  case is used for generating the optimal nth root rational
+  approximation.
+
+  Note this class requires the gnu multiprecision (GNU MP) library.
+
+*/
+
+#include<math.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<string>
+#include<iostream>
+#include<iomanip>
+#include<cassert>
+
+#include<Grid/algorithms/approx/Remez.h>
+
+// Constructor
+// Records the approximation interval [lower,upper] and the arithmetic precision, and
+// leaves the object in the "nothing allocated, nothing generated" state.
+AlgRemez::AlgRemez(double lower, double upper, long precision) 
+{
+  prec = precision;
+  bigfloat::setDefaultPrecision(prec);
+
+  apstrt = lower;
+  apend = upper;
+  apwidt = apend - apstrt;
+
+  std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n";
+  std::cout<<"Precision of arithmetic is "<<precision<<std::endl;
+
+  alloc = 0;
+  n = 0;
+  d = 0;
+
+  foundRoots = 0;
+
+  // No array exists until allocate() / generateApprox() run; give every pointer and
+  // counter a defined value so that nothing indeterminate can be read before then.
+  param = 0;
+  roots = 0;
+  poles = 0;
+  xx = 0;
+  mm = 0;
+  step = 0;
+  a = 0;
+  a_power = 0;
+  a_length = 0;   // no exponential factor
+  power_num = 1;  // func() is x^(1/1) until generateApprox() sets the real power
+  power_den = 1;
+  neq = 0;
+  iter = 0;
+
+  // Only require the approximation spread to be less than 1 ulp
+  tolerance = 1e-15;
+}
+
+// Destructor
+// Releases the arrays created by allocate(); when allocate() never ran there is
+// nothing to release.
+AlgRemez::~AlgRemez()
+{
+  if (!alloc) return;
+
+  // degree-dependent arrays
+  delete [] param;
+  delete [] roots;
+  delete [] poles;
+  delete [] xx;
+  delete [] mm;
+
+  // exponential-sum arrays
+  delete [] a_power;
+  delete [] a;
+}
+
+// Free memory and reallocate as necessary
+// The five degree-dependent arrays are re-created on every call; the two
+// exponential-sum arrays have fixed length SUM_MAX and are created on the first call only.
+void AlgRemez::allocate(int num_degree, int den_degree)
+{
+  const bool first_call = !alloc;
+  const int total_degree = num_degree+den_degree;
+
+  // Arrays from a previous call are released before their replacements are made
+  if (!first_call) {
+    delete [] param;
+    delete [] roots;
+    delete [] poles;
+    delete [] xx;
+    delete [] mm;
+  }
+
+  // Note use of new and delete in memory allocation - cannot run on qcdsp
+  param = new bigfloat[total_degree+1];
+  roots = new bigfloat[num_degree];
+  poles = new bigfloat[den_degree];
+  xx = new bigfloat[total_degree+3];
+  mm = new bigfloat[total_degree+2];
+
+  if (first_call) {
+    // The coefficients of the sum in the exponential
+    a = new bigfloat[SUM_MAX];
+    a_power = new int[SUM_MAX];
+  }
+
+  alloc = 1;
+}
+
+// Reset the bounds of the approximation
+// Stores both end points and the width between them.
+void AlgRemez::setBounds(double lower, double upper)
+{
+  apend  = upper;
+  apstrt = lower;
+
+  // width is formed from the stored bounds
+  apwidt = apend - apstrt;
+}
+
+// Generate the rational approximation x^(pnum/pden)
+// Shorthand for the case where numerator and denominator share one degree.
+double AlgRemez::generateApprox(int degree, unsigned long pnum, 
+				unsigned long pden)
+{
+  const int num_degree = degree;
+  const int den_degree = degree;
+  return generateApprox(num_degree, den_degree, pnum, pden);
+}
+
+// Generate the rational approximation x^(pnum/pden) with separate numerator and
+// denominator degrees and no exponential factor.
+double AlgRemez::generateApprox(int num_degree, int den_degree, 
+				unsigned long pnum, unsigned long pden)
+{
+  // Zero exponential terms: the coefficient and power arrays are passed as null
+  const int no_terms = 0;
+  double *no_coeffs = 0;
+  int *no_powers = 0;
+  return generateApprox(num_degree, den_degree, pnum, pden, no_terms, no_coeffs, no_powers);
+}
+
+// Generate the rational approximation x^(pnum/pden)
+//
+// num_degree, den_degree : degrees of numerator and denominator
+// pnum, pden             : the power being approximated is pnum/pden
+// a_len, a_param, a_pow  : a_len terms (at most SUM_MAX) a_param[j]*x^a_pow[j] whose
+//                          sum is exponentiated and multiplied into the target
+//                          function (see func()); the arrays are not read when
+//                          a_len is 0
+//
+// Iterates equations() / search() until the relative spread of the error maxima drops
+// to the tolerance, then attempts to find the roots and poles.
+// Returns the error at the first extremum, mm[0].
+double AlgRemez::generateApprox(int num_degree, int den_degree, 
+				unsigned long pnum, unsigned long pden,
+				int a_len, double *a_param, int *a_pow)
+{
+  std::cout<<"Degree of the approximation is ("<<num_degree<<","<<den_degree<<")\n";
+  std::cout<<"Approximating the function x^("<<pnum<<"/"<<pden<<")\n";
+
+  // Reallocate arrays, since degree has changed. The !alloc test covers a first call
+  // whose degrees happen to equal the initial n = d = 0.
+  if (!alloc || num_degree != n || den_degree != d) allocate(num_degree,den_degree);
+
+  assert(a_len<=SUM_MAX);
+
+  // Roots and poles from an earlier approximation do not belong to this one; the
+  // flag is set again below only if root() succeeds.
+  foundRoots = 0;
+
+  step = new bigfloat[num_degree+den_degree+2];
+
+  a_length = a_len;
+  for (int j=0; j<a_len; j++) {
+    a[j]= a_param[j];
+    a_power[j] = a_pow[j];
+  }
+
+  power_num = pnum;
+  power_den = pden;
+  spread = 1.0e37;
+  iter = 0;
+
+  n = num_degree;
+  d = den_degree;
+  neq = n + d + 1;
+
+  initialGuess();
+  stpini(step);
+
+  while (spread > tolerance) { //iterate until convergence
+
+    if (iter++%100==0) 
+      std::cout<<"Iteration " <<iter-1<<" spread "<<(double)spread<<" delta "<<(double)delta<<std::endl; 
+
+    equations();
+    if (delta < tolerance) {
+      std::cout<<"Delta too small, try increasing precision\n";
+      assert(0);
+    }
+    // Also trips when the comparison above is false without delta being >= tolerance
+    assert( delta>= tolerance);
+
+    search(step);
+  }
+
+  int sign;
+  double error = (double)getErr(mm[0],&sign);
+  std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl;
+
+  // Once the approximation has been generated, calculate the roots
+  if(!root()) {
+    std::cout<<"Root finding failed\n";
+  } else {
+    foundRoots = 1;
+  }
+  
+  delete [] step;
+  step = 0; // do not leave the member pointing at freed storage
+
+  // Return the maximum error in the approximation
+  return error;
+}
+
+// Return the partial fraction expansion of the approximation x^(pnum/pden)
+// On success Res[0..n-1], Pole[0..d-1] and *Norm are written. When a precondition is
+// not met a message is printed and the outputs are left untouched.
+// The return value is 0 in every case.
+int AlgRemez::getPFE(double *Res, double *Pole, double *Norm) {
+
+  // Preconditions, checked in this order; the first one that fails is reported
+  const char *complaint = 0;
+  if (n!=d)             complaint = "Cannot handle case: Numerator degree neq Denominator degree\n";
+  else if (!alloc)      complaint = "Approximation not yet generated\n";
+  else if (!foundRoots) complaint = "Roots not found, so PFE cannot be taken\n";
+
+  if (complaint) {
+    std::cout<<complaint;
+    return 0;
+  }
+
+  // pfe() overwrites the arrays it is given, so it works on copies
+  bigfloat *res = new bigfloat[n];
+  bigfloat *shift = new bigfloat[d];
+  
+  for (int k=0; k<n; k++) res[k] = roots[k];
+  for (int k=0; k<d; k++) shift[k] = poles[k];
+  
+  // Perform a partial fraction expansion
+  pfe(res, shift, norm);
+
+  // Convert to double and return
+  *Norm = (double)norm;
+  for (int k=0; k<n; k++) Res[k] = (double)res[k];
+  for (int k=0; k<d; k++) Pole[k] = (double)shift[k];
+
+  delete [] res;
+  delete [] shift;
+
+  // Where the smallest shift is located
+  return 0;
+}
+
+// Return the partial fraction expansion of the approximation x^(-pnum/pden)
+// Same contract as getPFE, applied to the inverse function: roots and poles exchange
+// roles and the normalisation is inverted.
+// The return value is 0 in every case.
+int AlgRemez::getIPFE(double *Res, double *Pole, double *Norm) {
+
+  // Preconditions, checked in this order; the first one that fails is reported
+  const char *complaint = 0;
+  if (n!=d)             complaint = "Cannot handle case: Numerator degree neq Denominator degree\n";
+  else if (!alloc)      complaint = "Approximation not yet generated\n";
+  else if (!foundRoots) complaint = "Roots not found, so PFE cannot be taken\n";
+
+  if (complaint) {
+    std::cout<<complaint;
+    return 0;
+  }
+
+  bigfloat *res = new bigfloat[d];
+  bigfloat *shift = new bigfloat[n];
+  
+  // Want the inverse function
+  for (int k=0; k<n; k++) {
+    res[k] = poles[k];
+    shift[k] = roots[k];
+  }
+
+  // Perform a partial fraction expansion
+  pfe(res, shift, (bigfloat)1l/norm);
+
+  // Convert to double and return
+  *Norm = (double)((bigfloat)1l/(norm));
+  for (int k=0; k<n; k++) {
+    Res[k] = (double)res[k];
+    Pole[k] = (double)shift[k];
+  }
+
+  delete [] res;
+  delete [] shift;
+
+  // Where the smallest shift is located
+  return 0;
+}
+
+// Initial values of maximal and minimal errors
+// Fills mm[0..neq] and xx[0..neq] with starting points inside [apstrt,apend].
+// Each point starts as a Chebyshev-style fraction 0.5*(1-cos(.)) of the interval and
+// is then passed through (exp(r)-1)/(e-1) before being scaled by the width.
+void AlgRemez::initialGuess() {
+
+  // Supply initial guesses for solution points
+  const long ncheb = neq;			// Degree of Chebyshev error estimate
+  const double e_minus_one = exp(1.0)-1.0;
+  bigfloat denom, frac;
+
+  // Find ncheb+1 extrema of Chebyshev polynomial
+
+  denom = ncheb;
+  mm[0] = apstrt;
+  for (long k = 1; k < ncheb; k++) {
+    frac = 0.5 * (1 - cos((M_PI * k)/(double) denom));
+    //frac *= sqrt_bf(frac);
+    frac = (exp((double)frac)-1.0)/e_minus_one;
+    mm[k] = apstrt + frac * apwidt;
+  }
+  mm[ncheb] = apend;
+
+  // Points interleaved with the ones above (odd multiples of half the angle step)
+  denom = 2.0 * ncheb;
+  for (long k = 0; k <= ncheb; k++) {
+    frac = 0.5 * (1 - cos(M_PI * (2*k+1)/(double) denom));
+    //frac *= sqrt_bf(frac); // Squeeze to low end of interval
+    frac = (exp((double)frac)-1.0)/e_minus_one;
+    xx[k] = apstrt + frac * apwidt;
+  }
+}
+
+// Initialise step sizes
+// step[i] is the gap between consecutive xx points (measured from apstrt for i = 0);
+// the last entry repeats the one before it. Also resets delta and pins xx[neq+1] to
+// the upper bound.
+void AlgRemez::stpini(bigfloat *step) {
+  delta = 0.25;
+  xx[neq+1] = apend;
+
+  step[0] = xx[0] - apstrt;
+  int k = 1;
+  while (k < neq) {
+    step[k] = xx[k] - xx[k-1];
+    ++k;
+  }
+  step[neq] = step[neq-1];
+}
+
+// Search for error maxima and minima
+//
+// One refinement pass over the neq+1 extremum estimates mm[] and the neq zero
+// estimates xx[]:
+//   1. for each extremum, walk from mm[i] in steps of step[i] (reversing direction if
+//      the first step leaves the interval or lowers the error) while the error
+//      reported by getErr() does not decrease, staying between the neighbouring zeros;
+//   2. update spread (relative difference between the largest and smallest extremum
+//      error) and halve delta if the spread did not shrink;
+//   3. derive new step sizes from the ratio of neighbouring extremum errors;
+//   4. move the zeros xx[] by those steps.
+// step must have room for neq+1 entries and is overwritten.
+void AlgRemez::search(bigfloat *step) {
+  bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
+  int i, meq, emsign, ensign, steps;
+
+  meq = neq + 1;
+  bigfloat *yy = new bigfloat[meq];   // error found at each extremum
+
+  bigfloat eclose = 1.0e30;           // smallest extremum error seen so far
+  bigfloat farther = 0l;              // largest extremum error seen so far
+
+  xx0 = apstrt;                       // lower edge of the current interval
+
+  for (i = 0; i < meq; i++) {
+    steps = 0;
+    xx1 = xx[i]; // Next zero
+    if (i == meq-1) xx1 = apend;      // the last interval ends at the upper bound
+    xm = mm[i];
+    ym = getErr(xm,&emsign);
+    q = step[i];
+    xn = xm + q;
+    if (xn < xx0 || xn >= xx1) {	// Cannot skip over adjacent boundaries
+      q = -q;
+      xn = xm;
+      yn = ym;
+      ensign = emsign;
+    } else {
+      yn = getErr(xn,&ensign);
+      if (yn < ym) {                  // error drops in this direction: turn around
+	q = -q;
+	xn = xm;
+	yn = ym;
+	ensign = emsign;
+      }
+    }
+  
+    while(yn >= ym) {		// March until error becomes smaller.
+      if (++steps > 10) break;        // at most 10 steps per extremum per pass
+      ym = yn;
+      xm = xn;
+      emsign = ensign;
+      a = xm + q;
+      if (a == xm || a <= xx0 || a >= xx1) break;// Must not skip over the zeros either side.
+      xn = a;
+      yn = getErr(xn,&ensign);
+    }
+
+    mm[i] = xm;			// Position of maximum
+    yy[i] = ym;			// Value of maximum
+
+    if (eclose > ym) eclose = ym;
+    if (farther < ym) farther = ym;
+
+    xx0 = xx1; // Walk to next zero.
+  } // end of search loop
+
+  q = (farther - eclose);	// Decrease step size if error spread increased
+  if (eclose != 0.0) q /= eclose; // Relative error spread
+  if (q >= spread) delta *= 0.5; // Spread is increasing; decrease step size
+  spread = q;
+
+  // New step for each zero: relative imbalance of the two neighbouring extremum
+  // errors (capped at 0.25, or 0.0625 when the right-hand error is zero), times the
+  // distance between those extrema, times delta.
+  for (i = 0; i < neq; i++) {
+    q = yy[i+1];
+    if (q != 0.0) q = yy[i] / q  - (bigfloat)1l;
+    else q = 0.0625;
+    if (q > (bigfloat)0.25) q = 0.25;
+    q *= mm[i+1] - mm[i];
+    step[i] = q * delta;
+  }
+  step[neq] = step[neq-1];
+  
+  for (i = 0; i < neq; i++) {	// Insert new locations for the zeros.
+    xm = xx[i] - step[i];
+    if (xm <= apstrt) continue;       // would leave the interval: keep the old zero
+    if (xm >= apend) continue;
+    // keep the zero strictly between its two neighbouring extrema
+    if (xm <= mm[i]) xm = (bigfloat)0.5 * (mm[i] + xx[i]);
+    if (xm >= mm[i+1]) xm = (bigfloat)0.5 * (mm[i+1] + xx[i]);
+    xx[i] = xm;
+  }
+
+  delete [] yy;
+}
+
+// Solve the equations
+//
+// Builds the neq x neq linear system that makes the rational form P(x)/Q(x) equal to
+// func(x) at each of the points xx[0..neq-1], with Q monic of degree d:
+//   sum_{j=0..n} p_j x^j  -  func(x) * sum_{j=0..d-1} q_j x^j  =  func(x) * x^d
+// and solves it with simq(); the solution (p_0..p_n, q_0..q_{d-1}) is stored in param.
+// If simq() fails the process is terminated with a failure status.
+void AlgRemez::equations(void) {
+  bigfloat x, y, z;
+  int i, j, ip;
+  bigfloat *aa;
+
+  bigfloat *AA = new bigfloat[(neq)*(neq)];
+  bigfloat *BB = new bigfloat[neq];
+  
+  for (i = 0; i < neq; i++) {	// set up the equations for solution by simq()
+    ip = neq * i;		// offset to 1st element of this row of matrix
+    x = xx[i];			// the guess for this row
+    y = func(x);		// right-hand-side vector
+
+    // numerator columns: 1, x, ..., x^n
+    z = (bigfloat)1l;
+    aa = AA+ip;
+    for (j = 0; j <= n; j++) {
+      *aa++ = z;
+      z *= x;
+    }
+
+    // denominator columns: -y, -y x, ..., -y x^(d-1)
+    z = (bigfloat)1l;
+    for (j = 0; j < d; j++) {
+      *aa++ = -y * z;
+      z *= x;
+    }
+    BB[i] = y * z;		// Right hand side vector
+  }
+
+  // Solve the simultaneous linear equations.
+  const int failed = simq(AA, BB, param, neq);
+
+  delete [] AA;
+  delete [] BB;
+
+  if (failed) {
+    std::cout<<"simq failed\n";
+    // A failed solve is an error: do not report success to the calling environment.
+    exit(EXIT_FAILURE);
+  }
+
+}
+
+// Evaluate the rational form P(x)/Q(x) using coefficients
+// from the solution vector param
+// param[0..n] are the numerator coefficients, param[n+1..n+d] the denominator
+// coefficients below its leading term, whose coefficient is fixed at 1.
+bigfloat AlgRemez::approx(const bigfloat x) {
+  bigfloat numer, denom;
+
+  // Work backwards toward the constant term.
+  numer = param[n];		// Highest order numerator coefficient
+  for (int k = n-1; k >= 0; k--) {
+    numer = x * numer  +  param[k];
+  }
+
+  denom = x + param[n+d];	// Highest degree coefficient = 1.0
+  for (int k = n+d-1; k > n; k--) {
+    denom = x * denom  +  param[k];
+  }
+
+  return(numer/denom);
+}
+
+// Compute size and sign of the approximation error at x
+// The error is approx(x) - func(x), divided by func(x) unless that is zero.
+// Returns its magnitude and stores -1 or +1 in *sign.
+bigfloat AlgRemez::getErr(bigfloat x, int *sign) {
+  bigfloat target, err;
+
+  target = func(x);
+  err = approx(x) - target;
+  if (target != 0) err /= target;
+
+  if (err < (bigfloat)0.0) {
+    *sign = -1;
+    err = -err;
+    return(err);
+  }
+
+  *sign = 1;
+  return(err);
+}
+
+// Calculate function required for the approximation.
+// This is x^(power_num/power_den), multiplied by exp( sum_j a[j]*x^a_power[j] )
+// when a_length > 0.
+bigfloat AlgRemez::func(const bigfloat x) {
+
+  bigfloat exponent = (bigfloat)power_num / (bigfloat)power_den;
+  bigfloat root_part;
+
+  // x == 1 is answered directly without calling pow_bf
+  if (x == (bigfloat)1.0) root_part = (bigfloat)1.0;
+  else root_part = pow_bf(x,exponent);
+
+  if (a_length <= 0) return root_part;
+
+  bigfloat sum = 0l;
+  for (int j=0; j<a_length; j++) sum += a[j]*pow_bf(x,a_power[j]);
+  return root_part * exp_bf(sum);
+
+}
+
+// Solve the system AX=B
+//
+// A : n x n matrix in row-major order; overwritten with the elimination multipliers
+//     and the upper-triangular factor
+// B : right-hand side, n entries (not modified)
+// X : receives the solution; during elimination it temporarily holds the reciprocal
+//     of each row's largest entry, used to scale the pivot search
+// n : order of the system (this parameter hides the member n inside the function)
+//
+// Rows are never physically exchanged: IPS records the pivot order instead.
+// Returns 0 on success, 1 if a row of A is entirely zero, 2 if no non-zero pivot can
+// be found for some column, 3 if the final pivot is zero.
+int AlgRemez::simq(bigfloat A[], bigfloat B[], bigfloat X[], int n) {
+
+  int i, j, ij, ip, ipj, ipk, ipn;
+  int idxpiv, iback;
+  int k, kp, kp1, kpk, kpn;
+  int nip, nkp, nm1;
+  bigfloat em, q, rownrm, big, size, pivot, sum;
+  bigfloat *aa;
+
+  // simq() work vector: one row index per equation. The element count is n; the
+  // original malloc-style "count * sizeof(int)" requested several times too many.
+  int *IPS = new int[n];
+
+  nm1 = n - 1;
+  // Initialize IPS and X
+  
+  ij = 0;
+  for (i = 0; i < n; i++) {
+    IPS[i] = i;
+    rownrm = 0.0;
+    for(j = 0; j < n; j++) {
+      q = abs_bf(A[ij]);
+      if(rownrm < q) rownrm = q;
+      ++ij;
+    }
+    if (rownrm == (bigfloat)0l) {
+      std::cout<<"simq rownrm=0\n";
+      delete [] IPS;
+      return(1);
+    }
+    X[i] = (bigfloat)1.0 / rownrm;
+  }
+  
+  // Elimination with scaled pivoting
+  for (k = 0; k < nm1; k++) {
+    big = 0.0;
+    idxpiv = 0;
+    
+    // choose the remaining row whose column-k entry is largest relative to its row scale
+    for (i = k; i < n; i++) {
+      ip = IPS[i];
+      ipk = n*ip + k;
+      size = abs_bf(A[ipk]) * X[ip];
+      if (size > big) {
+        big = size;
+        idxpiv = i;
+      }
+    }
+    
+    if (big == (bigfloat)0l) {
+      std::cout<<"simq big=0\n";
+      delete [] IPS;
+      return(2);
+    }
+    if (idxpiv != k) {
+      j = IPS[k];
+      IPS[k] = IPS[idxpiv];
+      IPS[idxpiv] = j;
+    }
+    kp = IPS[k];
+    kpk = n*kp + k;
+    pivot = A[kpk];
+    kp1 = k+1;
+    for (i = kp1; i < n; i++) {
+      ip = IPS[i];
+      ipk = n*ip + k;
+      em = -A[ipk] / pivot;
+      A[ipk] = -em;             // keep the multiplier for the forward substitution
+      nip = n*ip;
+      nkp = n*kp;
+      aa = A+nkp+kp1;
+      for (j = kp1; j < n; j++) {
+        ipj = nip + j;
+        A[ipj] = A[ipj] + em * *aa++;
+      }
+    }
+  }
+  kpn = n * IPS[n-1] + n - 1;	// last element of the IPS[n-1] th row
+  if (A[kpn] == (bigfloat)0l) {
+    std::cout<<"simq A[kpn]=0\n";
+    delete [] IPS;
+    return(3);
+  }
+
+  
+  // Forward substitution with the stored multipliers
+  ip = IPS[0];
+  X[0] = B[ip];
+  for (i = 1; i < n; i++) {
+    ip = IPS[i];
+    ipj = n * ip;
+    sum = 0.0;
+    for (j = 0; j < i; j++) {
+      sum += A[ipj] * X[j];
+      ++ipj;
+    }
+    X[i] = B[ip] - sum;
+  }
+  
+  // Back substitution
+  ipn = n * IPS[n-1] + n - 1;
+  X[n-1] = X[n-1] / A[ipn];
+  
+  for (iback = 1; iback < n; iback++) {
+    //i goes (n-2),...,0
+    i = nm1 - iback;
+    ip = IPS[i];
+    nip = n*ip;
+    sum = 0.0;
+    aa = A+nip+i+1;
+    for (j= i + 1; j < n; j++) 
+      sum += *aa++ * X[j];
+    X[i] = (X[i] - sum) / A[nip+i];
+  }
+  
+  delete [] IPS;
+  return(0);
+}
+
+// Calculate the roots of the approximation
+//
+// Finds the n numerator roots and then the d denominator roots (poles) with Newton's
+// method (rtnewt), dividing each root out of the working polynomial as it is found,
+// and stores param[n] in norm.
+// Returns 1 on success and 0 as soon as one Newton search fails; rtnewt signals
+// failure by returning 0.0, so a value of exactly zero is treated as a failure.
+int AlgRemez::root() {
+
+  long i,j;
+  bigfloat upper=1, lower=-100000;
+  bigfloat tol = 1e-20;
+
+  bigfloat *poly = new bigfloat[neq+1];
+
+  // First find the numerator roots
+  for (i=0; i<=n; i++) poly[i] = param[i];
+
+  for (i=n-1; i>=0; i--) {
+    roots[i] = rtnewt(poly,i+1,lower,upper,tol);
+    if (roots[i] == 0.0) {
+      std::cout<<"Failure to converge on root "<<i+1<<"/"<<n<<"\n";
+      delete [] poly; // release the work array on the failure path too
+      return 0;
+    }
+    // deflate: divide the root just found out of the polynomial
+    poly[0] = -poly[0]/roots[i];
+    for (j=1; j<=i; j++) poly[j] = (poly[j-1] - poly[j])/roots[i];
+  }
+  
+ // Now find the denominator roots
+  poly[d] = 1l;
+  for (i=0; i<d; i++) poly[i] = param[n+1+i];
+
+  for (i=d-1; i>=0; i--) {
+    poles[i]=rtnewt(poly,i+1,lower,upper,tol);
+    if (poles[i] == 0.0) {
+      std::cout<<"Failure to converge on pole "<<i+1<<"/"<<d<<"\n";
+      delete [] poly; // release the work array on the failure path too
+      return 0;
+    }
+    // deflate: divide the pole just found out of the polynomial
+    poly[0] = -poly[0]/poles[i];
+    for (j=1; j<=i; j++) poly[j] = (poly[j-1] - poly[j])/poles[i];
+  }
+
+  norm = param[n];
+
+  delete [] poly;
+
+  return 1;
+}
+
+// Evaluate the polynomial
+// poly[0..size] are the coefficients in ascending order of power; Horner's rule is
+// applied from the highest power downwards.
+bigfloat AlgRemez::polyEval(bigfloat x, bigfloat *poly, long size) {
+  bigfloat value = poly[size];
+  int k = size-1;
+  while (k >= 0) {
+    value = value*x + poly[k];
+    --k;
+  }
+  return value;
+}
+
+// Evaluate the differential of the polynomial
+// Horner's rule on the coefficients k*poly[k], k = size..1, of the derivative.
+bigfloat AlgRemez::polyDiff(bigfloat x, bigfloat *poly, long size) {
+  bigfloat slope = (bigfloat)size*poly[size];
+  int k = size-1;
+  while (k > 0) {
+    slope = slope*x + (bigfloat)k*poly[k];
+    --k;
+  }
+  return slope;
+}
+
+
+// Newton's method to calculate roots
+// Starts from the midpoint of [x1,x2] and iterates on the degree-i polynomial poly
+// until a correction is smaller in magnitude than xacc. The bounds are used for the
+// starting point only.
+// Returns the root, or 0.0 after printing a message if JMAX iterations do not suffice.
+bigfloat AlgRemez::rtnewt(bigfloat *poly, long i, bigfloat x1, 
+			  bigfloat x2, bigfloat xacc) {
+  bigfloat slope, correction, value, guess;
+
+  guess=(bigfloat)0.5*(x1+x2);
+  for (int pass=1; pass<=JMAX; pass++) {
+    value = polyEval(guess, poly, i);
+    slope = polyDiff(guess, poly, i);
+    correction = value/slope;
+    guess -= correction;
+    if (abs_bf(correction) < xacc) return guess;
+  }
+
+  std::cout<<"Maximum number of iterations exceeded in rtnewt\n";
+  return 0.0;
+}
+
+// Evaluate the partial fraction expansion of the rational function
+// with res roots and poles poles.  Result is overwritten on input
+// arrays.
+// On return res[] holds the residues (scaled by norm) and poles[] the negated poles,
+// both reordered so that poles[] is ascending. The parameters poles and norm hide the
+// members of the same name.
+void AlgRemez::pfe(bigfloat *res, bigfloat *poles, bigfloat norm) {
+  bigfloat *numer = new bigfloat[n];
+  bigfloat *denom = new bigfloat[d];
+
+  // Construct the polynomials explicitly: begin with the constant 1 ...
+  numer[0]=1l;
+  denom[0]=1l;
+  for (int c=1; c<n; c++) {
+    numer[c] = 0l;
+    denom[c] = 0l;
+  }
+
+  // ... and multiply in one linear factor per root and per pole, keeping the
+  // coefficients of x^0 .. x^(n-1)
+  for (int f=0; f<n; f++) {
+    for (int c=n-1; c>=0; c--) {
+      numer[c] *= -res[f];
+      denom[c] *= -poles[f];
+      if (c == 0) continue;
+      numer[c] += numer[c-1];
+      denom[c] += denom[c-1];
+    }
+  }
+
+  // Convert to proper fraction form.
+  // Fraction is now in the form 1 + n/d, where O(n)+1=O(d)
+  for (int c=0; c<n; c++) numer[c] -= denom[c];
+
+  // Find the residues of the partial fraction expansion and absorb the
+  // coefficients.
+  for (int k=0; k<n; k++) {
+    // remainder polynomial evaluated at pole k ...
+    res[k] = 0l;
+    for (int c=n-1; c>=0; c--) {
+      res[k] = poles[k]*res[k]+numer[c];
+    }
+    // ... divided by the distance to every other pole
+    for (int m=n-1; m>=0; m--) {
+      if (m == k) continue;
+      res[k] /= poles[k]-poles[m];
+    }
+    res[k] *= norm;
+  }  
+
+  // res now holds the residues
+  for (int k=0; k<n; k++) poles[k] = -poles[k];
+
+  // Move the ordering of the poles from smallest to largest
+  bigfloat held;
+  for (int slot=0; slot<n; slot++) {
+    int lowest = slot;
+    for (int k=slot+1; k<n; k++) {
+      if (poles[k] < poles[lowest]) lowest = k;
+    }
+    if (lowest == slot) continue;
+
+    held = poles[lowest];
+    poles[lowest] = poles[slot];
+    poles[slot] = held;
+
+    held = res[lowest];
+    res[lowest] = res[slot];
+    res[slot] = held;
+  }
+
+  delete [] numer;
+  delete [] denom;
+}
+
+// Value of the rational approximation at x, in double precision.
+double AlgRemez::evaluateApprox(double x) {
+  const bigfloat point = (bigfloat)x;
+  return (double)approx(point);
+}
+
+// Reciprocal of the rational approximation at x, in double precision.
+double AlgRemez::evaluateInverseApprox(double x) {
+  const bigfloat point = (bigfloat)x;
+  return 1.0/(double)approx(point);
+}
+
+// Value of the function being approximated at x, in double precision.
+double AlgRemez::evaluateFunc(double x) {
+  const bigfloat point = (bigfloat)x;
+  return (double)func(point);
+}
+
+// Reciprocal of the function being approximated at x, in double precision.
+double AlgRemez::evaluateInverseFunc(double x) {
+  const bigfloat point = (bigfloat)x;
+  return 1.0/(double)func(point);
+}
+
+// Write one CSV row  x,approx(x),func(x),approx(x)-func(x)  per sample point, with x
+// stepping geometrically (factor 1.05) from the lower bound up to, but not including,
+// the upper bound. Nothing is written when the lower bound is not strictly positive.
+void AlgRemez::csv(std::ostream & os)
+{
+  double lambda_low = apstrt;
+  double lambda_high= apend;
+
+  // x*=1.05 only moves towards lambda_high when x is positive: from 0 it never moves
+  // and from a negative start it moves away, so the loop would never terminate.
+  if ( !(lambda_low > 0.0) ) return;
+
+  for (double x=lambda_low; x<lambda_high; x*=1.05) {
+    double f = evaluateFunc(x);
+    double r = evaluateApprox(x);
+    os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
+  }
+  return;
+}
+
--- a/Grid/algorithms/approx/Remez.h
+++ b/Grid/algorithms/approx/Remez.h
@@ -0,0 +1,184 @@
+/*
+
+  Mike Clark - 25th May 2005
+
+  alg_remez.h
+
+  AlgRemez is an implementation of the Remez algorithm, which in this
+  case is used for generating the optimal nth root rational
+  approximation.
+
+  Note this class requires the gnu multiprecision (GNU MP) library.
+
+*/
+
+#ifndef INCLUDED_ALG_REMEZ_H
+#define INCLUDED_ALG_REMEZ_H
+
+#include <stddef.h>
+#include <Grid/GridStd.h>
+
+#ifdef HAVE_LIBGMP
+#include "bigfloat.h"
+#else
+#include "bigfloat_double.h"
+#endif
+
+#define JMAX 10000 //Maximum number of iterations of Newton's approximation
+#define SUM_MAX 10 // Maximum number of terms in exponential
+
+/*
+ *Usage examples
+  AlgRemez remez(lambda_low,lambda_high,precision);
+  error = remez.generateApprox(n,d,y,z);
+  remez.getPFE(res,pole,&norm);
+  remez.getIPFE(res,pole,&norm);
+  remez.csv(ostream &os);
+ */
+
+// AlgRemez computes a rational approximation P(x)/Q(x) to a power function
+// x^(power_num/power_den) over an interval using the Remez algorithm, working
+// internally in bigfloat arithmetic and exposing a double-precision API.
+class AlgRemez
+{
+ private:
+  // NOTE(review): presumably a class-name string used for diagnostics; its
+  // use is not visible in this header -- confirm against the implementation.
+  char *cname;
+
+  // The approximation parameters
+  bigfloat *param, *roots, *poles;
+  bigfloat norm;
+
+  // The numerator and denominator degree (n=d)
+  int n, d;
+  
+  // The bounds of the approximation
+  bigfloat apstrt, apwidt, apend;
+
+  // the numerator and denominator of the power we are approximating
+  unsigned long power_num; 
+  unsigned long power_den;
+
+  // Flag to determine whether the arrays have been allocated
+  int alloc;
+
+  // Flag to determine whether the roots have been found
+  int foundRoots;
+
+  // Variables used to calculate the approximation
+  int nd1, iter;
+  bigfloat *xx, *mm, *step;
+  bigfloat delta, spread, tolerance;
+
+  // The exponential summation coefficients
+  bigfloat *a;
+  int *a_power;
+  int a_length;
+
+  // The number of equations we must solve at each iteration (n+d+1)
+  int neq;
+
+  // The precision of the GNU MP library
+  long prec;
+
+  // Initial values of maximal and minimal errors
+  void initialGuess();
+
+  // Solve the equations
+  void equations();
+
+  // Search for error maxima and minima
+  void search(bigfloat *step); 
+
+  // Initialise step sizes
+  void stpini(bigfloat *step);
+
+  // Calculate the roots of the approximation
+  int root();
+
+  // Evaluate the polynomial
+  bigfloat polyEval(bigfloat x, bigfloat *poly, long size);
+  //complex_bf polyEval(complex_bf x, complex_bf *poly, long size);
+
+  // Evaluate the differential of the polynomial
+  bigfloat polyDiff(bigfloat x, bigfloat *poly, long size);
+  //complex_bf polyDiff(complex_bf x, complex_bf *poly, long size);
+
+  // Newton's method to calculate roots
+  bigfloat rtnewt(bigfloat *poly, long i, bigfloat x1, bigfloat x2, bigfloat xacc);
+  //complex_bf rtnewt(complex_bf *poly, long i, bigfloat x1, bigfloat x2, bigfloat xacc);
+
+  // Compute the partial fraction expansion of the rational function whose
+  // roots are passed in res and whose poles are passed in poles.  The results
+  // overwrite the input arrays: on return res holds the residues and poles
+  // holds the sign-flipped poles, ordered from smallest to largest.
+  void pfe(bigfloat *res, bigfloat* poles, bigfloat norm);
+
+  // Calculate function required for the approximation
+  bigfloat func(bigfloat x);
+
+  // Compute size and sign of the approximation error at x
+  bigfloat getErr(bigfloat x, int *sign);
+
+  // Solve the system AX=B
+  int simq(bigfloat *A, bigfloat *B, bigfloat *X, int n);
+
+  // Free memory and reallocate as necessary
+  void allocate(int num_degree, int den_degree);
+
+  // Evaluate the rational form P(x)/Q(x) using coefficients from the
+  // solution vector param
+  bigfloat approx(bigfloat x);
+
+ public:
+  
+  // Constructor
+  AlgRemez(double lower, double upper, long prec);
+
+  // Destructor
+  virtual ~AlgRemez();
+
+  // Return the degree of the approximation; asserts that the numerator and
+  // denominator degrees are equal.
+  int getDegree(void){ 
+    assert(n==d);
+    return n;
+  }
+  // Reset the bounds of the approximation
+  void setBounds(double lower, double upper);
+  // Get the bounds of the approximation, converted to double
+  void getBounds(double &lower, double &upper) { 
+    lower=(double)apstrt;
+    upper=(double)apend;
+  }
+
+  // Generate the rational approximation x^(pnum/pden)
+  double generateApprox(int num_degree, int den_degree, 
+			unsigned long power_num, unsigned long power_den, 
+			int a_len, double* a_param, int* a_pow);
+  double generateApprox(int num_degree, int den_degree, 
+			unsigned long power_num, unsigned long power_den);
+  double generateApprox(int degree, unsigned long power_num, 
+			unsigned long power_den);
+
+  // Return the partial fraction expansion of the approximation x^(pnum/pden)
+  int getPFE(double *res, double *pole, double *norm);
+
+  // Return the partial fraction expansion of the approximation x^(-pnum/pden)
+  int getIPFE(double *res, double *pole, double *norm);
+
+  // Evaluate the rational form P(x)/Q(x) using coefficients from the
+  // solution vector param
+  double evaluateApprox(double x);
+
+  // Evaluate the rational form Q(x)/P(x) using coefficients from the
+  // solution vector param
+  double evaluateInverseApprox(double x);
+
+  // Calculate function required for the approximation
+  double evaluateFunc(double x);
+
+  // Calculate inverse function required for the approximation
+  double evaluateInverseFunc(double x);
+  
+  // Dump csv of function, approx and error
+  void csv(std::ostream &os);
+};
+
+#endif  // Include guard
+
+
+
--- a/Grid/algorithms/approx/Zolotarev.cc
+++ b/Grid/algorithms/approx/Zolotarev.cc
@@ -0,0 +1,730 @@
+/* -*- Mode: C; comment-column: 22; fill-column: 79; compile-command: "gcc -o zolotarev zolotarev.c -ansi -pedantic -lm -DTEST"; -*- */
+#define VERSION Source Time-stamp: <2015-05-18 16:32:08 neo>
+
+/* These C routines evaluate the optimal rational approximation to the signum
+ * function for epsilon < |x| < 1 using Zolotarev's theorem.
+ *
+ * To obtain reliable results for high degree approximations (large n) it is
+ * necessary to compute using sufficiently high precision arithmetic. To this
+ * end the code has been parameterised to work with the preprocessor names
+ * INTERNAL_PRECISION and PRECISION set to float, double, or long double as
+ * appropriate. INTERNAL_PRECISION is used in computing the Zolotarev
+ * coefficients, which are converted to PRECISION before being returned to the
+ * caller. Presumably even higher precision could be obtained using GMP or
+ * similar package, but bear in mind that rounding errors might also be
+ * significant in evaluating the resulting polynomial. The convergence criteria
+ * have been written in a precision-independent form. */
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+
+#ifndef INTERNAL_PRECISION
+#define INTERNAL_PRECISION double
+#endif
+
+#include "Zolotarev.h"
+#define ZOLOTAREV_INTERNAL
+#undef ZOLOTAREV_DATA
+#define ZOLOTAREV_DATA izd
+#undef ZPRECISION
+#define ZPRECISION INTERNAL_PRECISION
+#include "Zolotarev.h"
+#undef ZOLOTAREV_INTERNAL
+
+/* The ANSI standard appears not to know what pi is */
+
+#ifndef M_PI
+#define M_PI ((INTERNAL_PRECISION) 3.141592653589793238462643383279502884197\
+169399375105820974944592307816406286208998628034825342117068)
+#endif
+
+#define ZERO ((INTERNAL_PRECISION) 0)
+#define ONE ((INTERNAL_PRECISION) 1)
+#define TWO ((INTERNAL_PRECISION) 2)
+#define THREE ((INTERNAL_PRECISION) 3)
+#define FOUR ((INTERNAL_PRECISION) 4)
+#define HALF (ONE/TWO)
+
+/* The following obscenity seems to be the simplest (?) way to coerce the C
+ * preprocessor to convert the value of a preprocessor token into a string. */
+
+#define PP2(x) #x
+#define PP1(a,b,c) a ## b(c)
+#define STRINGIFY(name) PP1(PP,2,name)
+
+/* Compute the partial fraction expansion coefficients (alpha) from the
+ * factored form */
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Approx);
+
+/* Build the partial fraction coefficients for the rational function described
+ * by z: overall factor z->A, numerator zeros z->a[0 .. dn-1] and poles
+ * z->ap[0 .. dd-1].  An array of da = dd + 1 + type coefficients is allocated
+ * with malloc (the result is not checked) and stored, together with its
+ * length, in z->alpha and z->da. */
+static void construct_partfrac(izd *z) {
+  int dn = z -> dn, dd = z -> dd, type = z -> type;
+  int j, k, da = dd + 1 + type;
+  INTERNAL_PRECISION A = z -> A, *a = z -> a, *ap = z -> ap, *alpha;
+  alpha = (INTERNAL_PRECISION*) malloc(da * sizeof(INTERNAL_PRECISION));
+  /* alpha[j] = A * prod_k (ap[j] - a[k]) / prod_{k != j} (ap[j] - ap[k]);
+   * numerator factors with k >= dn and the k == j denominator factor are
+   * replaced by ONE. */
+  for (j = 0; j < dd; j++)
+    for (k = 0, alpha[j] = A; k < dd; k++)
+      alpha[j] *=
+	(k < dn ? ap[j] - a[k] : ONE) / (k == j ? ONE : ap[j] - ap[k]);
+  /* For type 1 the extra coefficient alpha[dd] is accumulated here, and every
+   * alpha[k] computed above is rescaled by a further factor over ap[k]. */
+  if(type == 1)	      /* implicit pole at zero? */
+    for (k = 0, alpha[dd] = A * (dn > dd ? - a[dd] : ONE); k < dd; k++) {
+      alpha[dd] *= a[k] / ap[k];
+      alpha[k] *= (dn > dd ? ap[k] - a[dd] : ONE) / ap[k];
+    }
+  /* The last entry is A when dn == da - 1 and zero otherwise. */
+  alpha[da-1] = dn == da - 1 ? A : ZERO;
+  z -> alpha = alpha;
+  z -> da = da;
+  return;
+}
+
+/* Expand a polynomial given in factored form, p = A product(x - a[i], i =
+ * 1..d), into dense coefficients.  The returned array is allocated with malloc
+ * (d + 2 entries) and holds the coefficient of x^k in element k for k = 0..d;
+ * the caller is responsible for freeing it. */
+
+static INTERNAL_PRECISION *poly_factored_to_dense(INTERNAL_PRECISION A, 
+						  INTERNAL_PRECISION *a,
+						  int d) {
+  INTERNAL_PRECISION *coeff =
+    (INTERNAL_PRECISION *) malloc((d + 2) * sizeof(INTERNAL_PRECISION));
+  int deg = 0;
+
+  coeff[0] = A;
+  /* Multiply the running polynomial by (x - a[deg]), one root at a time. */
+  while (deg < d) {
+    const INTERNAL_PRECISION root = a[deg];
+    int k = deg;
+
+    coeff[deg+1] = coeff[deg];	/* new leading coefficient */
+    while (k > 0) {
+      coeff[k] = coeff[k-1] - root*coeff[k];
+      k--;
+    }
+    coeff[0] *= - root;
+    deg++;
+  }
+  return coeff;
+}
+
+/* Convert a rational function of the form R0(x) = x p(x^2)/q(x^2) (type 0) or
+ * R1(x) = p(x^2)/[x q(x^2)] (type 1) into its continued fraction
+ * representation. We assume that 0 <= deg(q) - deg(p) <= 1 for type 0 and 0 <=
+ * deg(p) - deg(q) <= 1 for type 1. On input p and q are in factored form, and
+ * deg(q) = dq, deg(p) = dp.  The output is the continued fraction coefficients
+ * beta, stored innermost first, so that R(x) = beta[db-1] x + 1/(beta[db-2] x +
+ * 1/(... + 1/(beta[0] x))).
+ *
+ * The method used is as follows. There are four cases to consider:
+ *
+ * 0.i.  Type 0, deg p = deg q
+ *
+ * 0.ii. Type 0, deg p = deg q - 1
+ *
+ * 1.i.  Type 1, deg p = deg q
+ *
+ * 1.ii. Type 1, deg p = deg q + 1
+ *
+ * and these are connected by two transformations:
+ *
+ * A. To obtain a continued fraction expansion of type 1 we use a single-step
+ * polynomial division we find beta and r(x) such that p(x) = beta x q(x) +
+ * r(x), with deg(r) = deg(q). This implies that p(x^2) = beta x^2 q(x^2) +
+ * r(x^2), and thus R1(x) = x beta + r(x^2)/(x q(x^2)) = x beta + 1/R0(x)
+ * with R0(x) = x q(x^2)/r(x^2).
+ *
+ * B. A continued fraction expansion of type 0 is obtained in a similar, but
+ * not identical, manner. We use the polynomial division algorithm to compute
+ * the quotient beta and the remainder r that satisfy p(x) = beta q(x) + r(x)
+ * with deg(r) = deg(q) - 1. We thus have x p(x^2) = x beta q(x^2) + x r(x^2),
+ * so R0(x) = x beta + x r(x^2)/q(x^2) = x beta + 1/R1(x) with R1(x) = q(x^2) /
+ * (x r(x^2)).
+ *
+ * Note that the deg(r) must be exactly deg(q) for (A) and deg(q) - 1 for (B)
+ * because p and q have disjoint roots all of multiplicity 1. This means that
+ * the division algorithm requires only a single polynomial subtraction step.
+ *
+ * The transformations between the cases form the following finite state
+ * automaton:
+ *
+ * +------+            +------+            +------+            +------+
+ * |      |            |      | ---(A)---> |      |            |      |
+ * | 0.ii | ---(B)---> | 1.ii |            | 0.i  | <---(A)--- | 1.i  |
+ * |      |            |      | <---(B)--- |      |            |      |
+ * +------+            +------+            +------+            +------+
+ */
+
+static INTERNAL_PRECISION *contfrac_A(INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *, int, int);
+
+static INTERNAL_PRECISION *contfrac_B(INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *, int, int);
+
+/* Fill in the continued fraction representation of z: allocates z->beta with
+ * z->db = 2 * dd + 1 + type entries, expands the factored numerator and
+ * denominator into dense polynomials, and runs the contfrac_B (type 0) or
+ * contfrac_A (otherwise) recursion.  All temporaries are released before
+ * returning. */
+static void construct_contfrac(izd *z){
+  const int num_deg = z -> dn, den_deg = z -> dd;
+  INTERNAL_PRECISION *num, *den, *scratch;
+
+  z -> db = 2 * den_deg + 1 + z -> type;
+  z -> beta = (INTERNAL_PRECISION *)
+    malloc(z -> db * sizeof(INTERNAL_PRECISION));
+
+  /* Dense forms of numerator (with overall factor A) and monic denominator */
+  num = poly_factored_to_dense(z -> A, z -> a, num_deg);
+  den = poly_factored_to_dense(ONE, z -> ap, den_deg);
+
+  /* Work space for the remainder polynomial */
+  scratch = (INTERNAL_PRECISION *) malloc((MAX(num_deg,den_deg) + 1) *
+					  sizeof(INTERNAL_PRECISION));
+
+  if (z -> type == 0) {
+    (void) contfrac_B(z -> beta, num, den, scratch, num_deg, den_deg);
+  } else {
+    (void) contfrac_A(z -> beta, num, den, scratch, num_deg, den_deg);
+  }
+
+  free(num);
+  free(den);
+  free(scratch);
+}
+
+/* Transformation (A): one division step p(x) = x beta q(x) + r(x).  The
+ * remainder is written into r, the recursion continues through contfrac_B
+ * with (q, r) as the new (p, q) and the old p buffer reused as scratch, and
+ * the quotient of this step is stored in the slot returned by that recursive
+ * call.  Returns a pointer to the next free slot of beta. */
+static INTERNAL_PRECISION *contfrac_A(INTERNAL_PRECISION *beta,
+				      INTERNAL_PRECISION *p,
+				      INTERNAL_PRECISION *q,
+				      INTERNAL_PRECISION *r, int dp, int dq) {
+  INTERNAL_PRECISION quot, *rb;
+  int j;
+
+  /* p(x) = x beta q(x) + r(x); dp = dq or dp = dq + 1 */
+
+  /* When the degrees are equal there is no x * q(x) term to subtract */
+  quot = dp == dq ? ZERO : p[dp] / q[dq];
+  r[0] = p[0];
+  for (j = 1; j <= dp; j++) r[j] = p[j] - quot * q[j-1];
+#ifdef DEBUG
+  printf("%s: Continued Fraction form: deg p = %2d, deg q = %2d, beta = %g\n",
+	 __FUNCTION__, dp, dq, (float) quot);
+  for (j = 0; j <= dq + 1; j++)
+    printf("\tp[%2d] = %14.6g\tq[%2d] = %14.6g\tr[%2d] = %14.6g\n",
+	   j, (float) (j > dp ? ZERO : p[j]),
+	   j, (float) (j == 0 ? ZERO : q[j-1]),
+	   j, (float) (j == dp ? ZERO : r[j]));
+#endif /* DEBUG */
+  /* Deeper levels fill the lower slots of beta first; this level's quotient
+   * goes into the first slot they left free. */
+  *(rb = contfrac_B(beta, q, r, p, dq, dq)) = quot;
+  return rb + 1;
+}
+
+/* Transformation (B): one division step p(x) = beta q(x) + r(x).  The
+ * remainder is written into r; while dq > 0 the recursion continues through
+ * contfrac_A with (q, r) as the new (p, q) and the old p buffer reused as
+ * scratch.  When dq == 0 the recursion ends and this step's quotient is stored
+ * in beta[0].  Returns a pointer to the next free slot of beta. */
+static INTERNAL_PRECISION *contfrac_B(INTERNAL_PRECISION *beta,
+				      INTERNAL_PRECISION *p,
+				      INTERNAL_PRECISION *q,
+				      INTERNAL_PRECISION *r, int dp, int dq) {
+  INTERNAL_PRECISION quot, *rb;
+  int j;
+
+  /* p(x) = beta q(x) + r(x); dp = dq or dp = dq - 1 */
+
+  /* When p has lower degree than q the quotient is zero */
+  quot = dp == dq ? p[dp] / q[dq] : ZERO;
+  for (j = 0; j < dq; j++) r[j] = p[j] - quot * q[j];
+#ifdef DEBUG
+  printf("%s: Continued Fraction form: deg p = %2d, deg q = %2d, beta = %g\n",
+	 __FUNCTION__, dp, dq, (float) quot);
+  for (j = 0; j <= dq; j++)
+    printf("\tp[%2d] = %14.6g\tq[%2d] = %14.6g\tr[%2d] = %14.6g\n",
+	   j, (float) (j > dp ? ZERO : p[j]),
+	   j, (float) q[j],
+	   j, (float) (j == dq ? ZERO : r[j]));
+#endif /* DEBUG */
+  /* Deeper levels fill the lower slots of beta first; this level's quotient
+   * goes into the first slot they left free (beta itself when dq == 0). */
+  *(rb = dq > 0 ? contfrac_A(beta, q, r, p, dq, dq-1) : beta) = quot;
+  return rb + 1;
+}
+
+/* The global variable U is used to hold the argument u throughout the AGM
+ * recursion. The global variables F and K are set in the innermost
+ * instantiation of the recursive function AGM to the values of the elliptic
+ * integrals F(u,k) and K(k) respectively. They must be made thread local to
+ * make this code thread-safe in a multithreaded environment. */
+
+static INTERNAL_PRECISION U, F, K;	/* THREAD LOCAL */
+
+/* Recursive implementation of Gauss' arithmetico-geometric mean, which is the
+ * kernel of the method used to compute the Jacobian elliptic functions
+ * sn(u,k), cn(u,k), and dn(u,k) with parameter k (where 0 < k < 1), as well
+ * as the elliptic integral F(s,k) satisfying F(sn(u,k)) = u and the complete
+ * elliptic integral K(k).
+ *
+ * The algorithm used is a recursive implementation of the Gauss (Landen)
+ * transformation.
+ *
+ * The function returns the value of sn(u,k'), where k' is the dual parameter,
+ * and also sets the values of the global variables F and K.  The latter is
+ * used to determine the sign of cn(u,k').
+ *
+ * The algorithm is deemed to have converged when b ceases to increase. This
+ * works whatever INTERNAL_PRECISION is specified. */
+
+/* a, b: current arithmetic and geometric means; s: the transformed argument
+ * carried down the recursion.  Reads the file-scope U and writes the
+ * file-scope F and K at the innermost level.  Because of that shared state and
+ * the function-local static pb, the function is not reentrant. */
+static INTERNAL_PRECISION AGM(INTERNAL_PRECISION a,
+			      INTERNAL_PRECISION b,
+			      INTERNAL_PRECISION s) {
+  /* b from the previous recursion level; -ONE means "no previous level" */
+  static INTERNAL_PRECISION pb = -ONE;
+  INTERNAL_PRECISION c, d, xi;
+
+  /* Converged: b has stopped increasing.  Reset pb so that the next top-level
+   * call starts afresh, then evaluate the base case. */
+  if (b <= pb) {
+    pb = -ONE;
+    F = asin(s) / a;		/* Here, a is the AGM */
+    K = M_PI / (TWO * a);
+    return sin(U * a);
+  }
+  pb = b;
+  c = a - b;
+  d = a + b;
+  /* When c*c is negligible compared with ONE the simpler form HALF*s*d/a is
+   * used for the transformed argument instead of the expression that divides
+   * by c*s. */
+  xi = AGM(HALF*d, sqrt(a*b), ONE + c*c == ONE ?
+	   HALF*s*d/a : (a - sqrt(a*a - s*s*c*d))/(c*s));
+  return 2*a*xi / (d + c*xi*xi);
+}
+
+/* Computes sn(u,k), cn(u,k), dn(u,k), F(u,k), and K(k). It is essentially a
+ * wrapper for the routine AGM. The sign of cn(u,k) is defined to be -1 if
+ * K(k) < u < 3*K(k) and +1 otherwise, and thus sign is computed by some quite
+ * unnecessarily obfuscated bit manipulations. */
+
+/* Inputs: argument u and parameter k.  Outputs are written through sn, cn,
+ * dn, elF and elK.  Uses the file-scope variables U, F and K to communicate
+ * with AGM, so it is not reentrant. */
+static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
+		     INTERNAL_PRECISION* sn, INTERNAL_PRECISION* cn,
+		     INTERNAL_PRECISION* dn, INTERNAL_PRECISION* elF,
+		     INTERNAL_PRECISION* elK) {
+  int sgn;
+  U = u;			/* AGM reads u from the file-scope U */
+  *sn = AGM(ONE, sqrt(ONE - k*k), u);
+  /* Index of the interval of length K containing |u|, modulo 4; cn takes a
+   * negative sign for indices 1 and 2. */
+  sgn = ((int) (fabs(u) / K)) % 4; /* sgn = 0, 1, 2, 3 */
+  sgn ^= sgn >> 1;    /* (sgn & 1) = 0, 1, 1, 0 */
+  sgn = 1 - ((sgn & 1) << 1);	/* sgn = 1, -1, -1, 1 */
+  *cn = ((INTERNAL_PRECISION) sgn) * sqrt(ONE - *sn * *sn);
+  *dn = sqrt(ONE - k*k* *sn * *sn);
+  /* F and K were set by the innermost AGM call above */
+  *elF = F;
+  *elK = K;
+}
+
+/* Compute the coefficients for the optimal rational approximation R(x) to
+ * sgn(x) of degree n over the interval epsilon < |x| < 1 using Zolotarev's
+ * formula. 
+ *
+ * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
+ * type = 1 for the approximation which is infinite at x = 0. */
+
+/* Returns a malloc'ed zolotarev_data record whose arrays (a, ap, alpha, beta,
+ * gamma) are also malloc'ed; release it with zolotarev_free().  The work is
+ * done in INTERNAL_PRECISION in a temporary izd record and converted to
+ * PRECISION at the end.  No allocation result is checked, and n is not
+ * validated. */
+zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
+  /* Note: the local F declared here shadows the file-scope F used by AGM */
+  INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
+    l, invlambda, xi, xisq, *tv, s, opl;
+  int m, czero, ts;
+  zolotarev_data *zd;
+  izd *d = (izd*) malloc(sizeof(izd));
+
+  d -> type = type;
+  d -> epsilon = (INTERNAL_PRECISION) epsilon;
+  d -> n = n;
+  d -> dd = n / 2;
+  d -> dn = d -> dd - 1 + n % 2; /* n even: dn = dd - 1, n odd: dn = dd */
+  d -> deg_denom = 2 * d -> dd;
+  d -> deg_num = 2 * d -> dn + 1;
+
+  d -> a = (INTERNAL_PRECISION*) malloc((1 + d -> dn) *
+					sizeof(INTERNAL_PRECISION));
+  d -> ap = (INTERNAL_PRECISION*) malloc(d -> dd *
+					 sizeof(INTERNAL_PRECISION));
+  ksq = d -> epsilon * d -> epsilon;
+  kp = sqrt(ONE - ksq);
+  sncndnFK(ZERO, kp, &sn, &cn, &dn, &F, &Kp); /* compute Kp = K(kp) */
+  z0 = TWO * Kp / (INTERNAL_PRECISION) n;
+  M = ONE;
+  A = ONE / d -> epsilon;
+
+  sncndnFK(HALF * z0, kp, &sn, &cn, &dn, &F, &Kj); /* compute xi */
+  xi = ONE / dn;
+  xisq = xi * xi;
+  invlambda = xi;
+
+  /* Compute the zeros a[] and poles ap[], filling both arrays from the back,
+   * while accumulating the products M, A and invlambda. */
+  for (m = 0; m < d -> dd; m++) {
+    czero = 2 * (m + 1) == n; /* n even and m = dd -1 */
+
+    z = z0 * ((INTERNAL_PRECISION) m + ONE);
+    sncndnFK(z, kp, &sn, &cn, &dn, &F, &Kj);
+    t = cn / sn;
+    c = - t*t;
+    /* No numerator zero is stored on the czero iteration */
+    if (!czero) (d -> a)[d -> dn - 1 - m] = ksq / c;
+
+    z = z0 * ((INTERNAL_PRECISION) m + HALF);
+    sncndnFK(z, kp, &sn, &cn, &dn, &F, &Kj);
+    t = cn / sn;
+    cp = - t*t;
+    (d -> ap)[d -> dd - 1 - m] = ksq / cp;
+
+    M *= (ONE - c) / (ONE - cp);
+    A *= (czero ? -ksq : c) * (ONE - cp) / (cp * (ONE - c));
+    invlambda *= (ONE - c*xisq) / (ONE - cp*xisq);
+  }
+  invlambda /= M;
+  d -> A = TWO / (ONE + invlambda) * A;
+  d -> Delta = (invlambda - ONE) / (invlambda + ONE);
+
+  /* Cayley form coefficients gamma[0 .. n-1] */
+  d -> gamma = (INTERNAL_PRECISION*) malloc((1 + d -> n) *
+					    sizeof(INTERNAL_PRECISION));
+  l = ONE / invlambda;
+  opl = ONE + l;
+  sncndnFK(sqrt( d -> type == 1
+		   ? (THREE + l) / (FOUR * opl)
+		   : (ONE + THREE*l) / (opl*opl*opl)
+	       ), sqrt(ONE - l*l), &sn, &cn, &dn, &F, &Kj);
+  s = M * F;
+  for (m = 0; m < d -> n; m++) {
+    sncndnFK(s + TWO*Kp*m/n, kp, &sn, &cn, &dn, &F, &Kj);
+    d -> gamma[m] = d -> epsilon / dn;
+  }
+
+  /* If R(x) is a Zolotarev rational approximation of degree (n,m) with maximum
+   * error Delta, then (1 - Delta^2) / R(x) is also an optimal Chebyshev
+   * approximation of degree (m,n) */
+
+  /* For type 1, invert the overall factor and exchange the roles of the
+   * numerator and denominator (arrays, counts and degrees). */
+  if (d -> type == 1) {
+    d -> A = (ONE - d -> Delta * d -> Delta) / d -> A;
+    tv = d -> a; d -> a = d -> ap; d -> ap = tv;
+    ts = d -> dn; d -> dn = d -> dd; d -> dd = ts;
+    ts = d -> deg_num; d -> deg_num = d -> deg_denom; d -> deg_denom = ts;
+  }
+
+  /* These allocate and fill d->alpha/d->da and d->beta/d->db */
+  construct_partfrac(d);
+  construct_contfrac(d);
+
+  /* Converting everything to PRECISION for external use only */
+
+  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
+  zd -> A = (PRECISION) d -> A;
+  zd -> Delta = (PRECISION) d -> Delta;
+  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> n = d -> n;
+  zd -> type = d -> type;
+  zd -> dn = d -> dn;
+  zd -> dd = d -> dd;
+  zd -> da = d -> da;
+  zd -> db = d -> db;
+  zd -> deg_num = d -> deg_num;
+  zd -> deg_denom = d -> deg_denom;
+
+  /* Each internal array is copied element by element into a freshly
+   * allocated PRECISION array and then released. */
+  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  free(d -> a);
+
+  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  free(d -> ap);
+
+  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  free(d -> alpha);
+
+  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  free(d -> beta);
+
+  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  free(d -> gamma);
+
+  free(d);
+  return zd;
+}
+
+
+/* Release a zolotarev_data record as returned by zolotarev() or higham(),
+ * together with the coefficient arrays it owns.  A null pointer is accepted
+ * and ignored, matching the behaviour of free(). */
+void zolotarev_free(zolotarev_data *zdata)
+{
+    /* Nothing to release; avoids dereferencing a null record below */
+    if (!zdata) return;
+
+    free(zdata -> a);
+    free(zdata -> ap);
+    free(zdata -> alpha);
+    free(zdata -> beta);
+    free(zdata -> gamma);
+    free(zdata);
+}
+
+
+/* Build a type 0 rational approximation record of degree n whose zeros and
+ * poles are -tan^2 of equally spaced angles (multiples of pi/n), with all
+ * Cayley coefficients gamma set to one.  Returns a malloc'ed zolotarev_data
+ * record with malloc'ed arrays; release it with zolotarev_free().  No
+ * allocation result is checked, and n is not validated. */
+zolotarev_data* higham(PRECISION epsilon, int n) {
+  INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
+  int m, czero;
+  zolotarev_data *zd;
+  izd *d = (izd*) malloc(sizeof(izd));
+
+  d -> type = 0;
+  d -> epsilon = (INTERNAL_PRECISION) epsilon;
+  d -> n = n;
+  d -> dd = n / 2;
+  d -> dn = d -> dd - 1 + n % 2; /* n even: dn = dd - 1, n odd: dn = dd */
+  d -> deg_denom = 2 * d -> dd;
+  d -> deg_num = 2 * d -> dn + 1;
+
+  d -> a = (INTERNAL_PRECISION*) malloc((1 + d -> dn) *
+					sizeof(INTERNAL_PRECISION));
+  d -> ap = (INTERNAL_PRECISION*) malloc(d -> dd *
+					 sizeof(INTERNAL_PRECISION));
+  /* A temporarily holds n (to form the angular step z0) before being set to
+   * the overall factor: n for even n, 1/n for odd n. */
+  A = (INTERNAL_PRECISION) n;
+  z0 = M_PI / A;
+  A = n % 2 == 0 ? A : ONE / A;
+  /* M accumulates the value of the approximation at x = epsilon */
+  M = d -> epsilon * A;
+  epssq = d -> epsilon * d -> epsilon;
+
+  /* Compute the zeros a[] and poles ap[], filling both arrays from the back */
+  for (m = 0; m < d -> dd; m++) {
+    czero = 2 * (m + 1) == n; /* n even and m = dd - 1*/
+
+    /* No numerator zero is stored on the czero iteration */
+    if (!czero) {
+      z = z0 * ((INTERNAL_PRECISION) m + ONE);
+      t = tan(z);
+      c = - t*t;
+      (d -> a)[d -> dn - 1 - m] = c;
+      M *= epssq - c;
+    }
+
+    z = z0 * ((INTERNAL_PRECISION) m + HALF);
+    t = tan(z);
+    cp = - t*t;
+    (d -> ap)[d -> dd - 1 - m] = cp;
+    M /= epssq - cp;
+  }
+
+  d -> gamma = (INTERNAL_PRECISION*) malloc((1 + d -> n) *
+					    sizeof(INTERNAL_PRECISION));
+  for (m = 0; m < d -> n; m++) d -> gamma[m] = ONE;
+
+  d -> A = A;
+  d -> Delta = ONE - M;
+
+  /* These allocate and fill d->alpha/d->da and d->beta/d->db */
+  construct_partfrac(d);
+  construct_contfrac(d);
+
+  /* Converting everything to PRECISION for external use only */
+
+  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
+  zd -> A = (PRECISION) d -> A;
+  zd -> Delta = (PRECISION) d -> Delta;
+  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> n = d -> n;
+  zd -> type = d -> type;
+  zd -> dn = d -> dn;
+  zd -> dd = d -> dd;
+  zd -> da = d -> da;
+  zd -> db = d -> db;
+  zd -> deg_num = d -> deg_num;
+  zd -> deg_denom = d -> deg_denom;
+
+  /* Each internal array is copied element by element into a freshly
+   * allocated PRECISION array and then released. */
+  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  free(d -> a);
+
+  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  free(d -> ap);
+
+  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  free(d -> alpha);
+
+  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  free(d -> beta);
+
+  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  free(d -> gamma);
+
+  free(d);
+  return zd;
+}
+
+NAMESPACE_END(Approx);
+NAMESPACE_END(Grid);
+
+#ifdef TEST
+
+#undef ZERO
+#define ZERO ((PRECISION) 0)
+#undef ONE
+#define ONE ((PRECISION) 1)
+#undef TWO
+#define TWO ((PRECISION) 2)
+
+/* Evaluate R(x) from its factored form: the overall factor A, times x (type
+ * 0) or 1/x (type 1), times the ratio of the (x^2 - a[k]) and (x^2 - ap[k])
+ * products.  A factor that has no partner on the other side is paired with
+ * ONE. */
+
+static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
+  PRECISION value;
+  int k;
+
+  if (rdata -> type != 0) {
+    /* Type 1: one factor per numerator zero */
+    value = rdata -> A / x;
+    for (k = 0; k < rdata -> deg_num/2; k++) {
+      if (2*(k+1) > rdata -> deg_denom)
+	value *= (x*x - rdata -> a[k]) / ONE;
+      else
+	value *= (x*x - rdata -> a[k]) / (x*x - rdata -> ap[k]);
+    }
+    return value;
+  }
+
+  /* Type 0: one factor per pole */
+  value = rdata -> A * x;
+  for (k = 0; k < rdata -> deg_denom/2; k++) {
+    if (2*(k+1) > rdata -> deg_num)
+      value *= ONE / (x*x - rdata -> ap[k]);
+    else
+      value *= (x*x - rdata -> a[k]) / (x*x - rdata -> ap[k]);
+  }
+  return value;
+}
+
+/* Evaluate R(x) from its partial fraction form: x times the sum of the last
+ * alpha coefficient, one alpha[k] / (x^2 - ap[k]) term per pole and, for type
+ * 1, the additional alpha[dd] / x^2 term. */
+
+static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
+  const int npoles = rdata -> dd;
+  PRECISION sum = rdata -> alpha[rdata -> da - 1];
+  int k = 0;
+
+  while (k < npoles) {
+    sum += rdata -> alpha[k] / (x * x - rdata -> ap[k]);
+    k++;
+  }
+  if (rdata -> type == 1) {
+    sum += rdata -> alpha[npoles] / (x * x);
+  }
+  return sum * x;
+}    
+
+/* Evaluate R(x) from its continued fraction form, working outwards from the
+ * innermost coefficient beta[0].
+ *
+ * At x = 0 the result should be INF for type 1 and 0 for type 0, but
+ * intermediate steps divide by zero.  On IEEE implementations with
+ * non-signalling overflow this still gives the right answer, because 1/(1/0)
+ * = 1/INF = 0; with signalling overflow an error is raised instead. */
+
+static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
+  const int ncoef = rdata -> db;
+  PRECISION value = rdata -> beta[0] * x;
+  int level = 1;
+
+  while (level < ncoef) {
+    value = rdata -> beta[level] * x + ONE / value;
+    level++;
+  }
+  return value;
+}    
+
+/* Evaluate R(x) from its Cayley form R = (1 - T) / (1 + T), where T is the
+ * product of (gamma[k] - x) / (gamma[k] + x) over all n coefficients, with an
+ * extra factor of -1 unless the type is 0. */
+
+static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
+  PRECISION prod;
+  int k = 0;
+
+  if (rdata -> type == 0) {
+    prod = ONE;
+  } else {
+    prod = -ONE;
+  }
+  while (k < rdata -> n) {
+    prod *= (rdata -> gamma[k] - x) / (rdata -> gamma[k] + x);
+    k++;
+  }
+  return (ONE - prod) / (ONE + prod);
+}
+
+
+/* Test program. Apart from printing out the parameters for R(x) it produces
+ * the following data files for plotting (unless NPLOT is defined):
+ *
+ * zolotarev-fn is a plot of R(x) for |x| < 1.2. This should look like sgn(x).
+ *
+ * zolotarev-err is a plot of the error |R(x) - sgn(x)| scaled by 1/Delta. This
+ * should oscillate deg_num + deg_denom + 2 times between +1 and -1 over the
+ * domain epsilon <= |x| <= 1.
+ *
+ * If ALLPLOTS is defined then zolotarev-partfrac (zolotarev-contfrac) is a
+ * plot of the difference between the values of R(x) computed using the
+ * factored form and the partial fraction (continued fraction) form, scaled by
+ * 1/Delta. It should be zero everywhere. */
+
+/* Test driver.  Usage: prog epsilon n [type], with type 0 or 1 selecting the
+ * corresponding Zolotarev approximation and type 2 the Higham one.  Prints all
+ * coefficient sets and, unless NPLOT is defined, writes the plot data files
+ * and reports the maximum discrepancy between the factored form and the other
+ * representations.  Exits with EXIT_FAILURE on bad arguments or if an output
+ * file cannot be opened. */
+int main(int argc, char** argv) {
+  
+  int m, n, plotpts = 5000, type = 0;
+  float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
+  zolotarev_data *rdata;
+  PRECISION y;
+  FILE *plot_function, *plot_error, 
+    *plot_partfrac, *plot_contfrac, *plot_cayley;
+
+  if (argc < 3 || argc > 4) {
+    fprintf(stderr, "Usage: %s epsilon n [type]\n", *argv);
+    exit(EXIT_FAILURE);
+  }
+  /* Reject arguments that do not parse rather than carrying on with
+   * uninitialised values for eps or n. */
+  if (sscanf(argv[1], "%g", &eps) != 1 ||	/* First argument is epsilon */
+      sscanf(argv[2], "%d", &n) != 1 ||		/* Second argument is n */
+      (argc == 4 && sscanf(argv[3], "%d", &type) != 1)) { /* Third is type */
+    fprintf(stderr, "Usage: %s epsilon n [type]\n", *argv);
+    exit(EXIT_FAILURE);
+  }
+
+  if (type < 0 || type > 2) {
+    fprintf(stderr, "%s: type must be 0 (Zolotarev R(0) = 0),\n"
+	    "\t\t1 (Zolotarev R(0) = Inf, or 2 (Higham)\n", *argv);
+    exit(EXIT_FAILURE);
+  }
+
+  rdata = type == 2 
+    ? higham((PRECISION) eps, n) 
+    : zolotarev((PRECISION) eps, n, type);
+
+  printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
+	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
+	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
+	 "\tPRECISION = " STRINGIFY(PRECISION)
+	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
+	 "\tDelta = %g (maximum error)\n\n"
+	 "\tA = %g (overall factor)\n",
+	 (float) rdata -> epsilon, rdata -> n, type,
+	 rdata -> deg_num, rdata -> deg_denom,
+	 rdata -> type == 1 ? "infinite" : "zero",
+	 (float) rdata -> Delta, (float) rdata -> A);
+  /* Zeros and poles side by side, then whichever list is longer */
+  for (m = 0; m < MIN(rdata -> dd, rdata -> dn); m++)
+    printf("\ta[%2d] = %14.8g\t\ta'[%2d] = %14.8g\n",
+	   m + 1, (float) rdata -> a[m], m + 1, (float) rdata -> ap[m]);
+  if (rdata -> dd > rdata -> dn)
+    printf("\t\t\t\t\ta'[%2d] = %14.8g\n",
+	   rdata -> dn + 1, (float) rdata -> ap[rdata -> dn]);
+  if (rdata -> dd < rdata -> dn)
+    printf("\ta[%2d] = %14.8g\n",
+	   rdata -> dd + 1, (float) rdata -> a[rdata -> dd]);
+
+  printf("\n\tPartial fraction coefficients\n");
+  printf("\talpha[ 0] = %14.8g\n",
+	 (float) rdata -> alpha[rdata -> da - 1]);
+  for (m = 0; m < rdata -> dd; m++)
+    printf("\talpha[%2d] = %14.8g\ta'[%2d] = %14.8g\n",
+	   m + 1, (float) rdata -> alpha[m], m + 1, (float) rdata -> ap[m]);
+  if (rdata -> type == 1)
+    printf("\talpha[%2d] = %14.8g\ta'[%2d] = %14.8g\n",
+	   rdata -> dd + 1, (float) rdata -> alpha[rdata -> dd],
+	   rdata -> dd + 1, (float) ZERO);
+
+  printf("\n\tContinued fraction coefficients\n");
+  for (m = 0; m < rdata -> db; m++)
+    printf("\tbeta[%2d] = %14.8g\n", m, (float) rdata -> beta[m]);
+
+  printf("\n\tCayley transform coefficients\n");
+  for (m = 0; m < rdata -> n; m++)
+    printf("\tgamma[%2d] = %14.8g\n", m, (float) rdata -> gamma[m]);
+
+#ifndef NPLOT
+  plot_function = fopen("zolotarev-fn.dat", "w");
+  plot_error = fopen("zolotarev-err.dat", "w");
+  /* fprintf on a null FILE* is undefined, so stop if either file is missing */
+  if (plot_function == NULL || plot_error == NULL) {
+    fprintf(stderr, "%s: cannot open plot data files for writing\n", *argv);
+    exit(EXIT_FAILURE);
+  }
+#ifdef ALLPLOTS
+  plot_partfrac = fopen("zolotarev-partfrac.dat", "w");
+  plot_contfrac = fopen("zolotarev-contfrac.dat", "w");
+  plot_cayley = fopen("zolotarev-cayley.dat", "w");
+  if (plot_partfrac == NULL || plot_contfrac == NULL || plot_cayley == NULL) {
+    fprintf(stderr, "%s: cannot open plot data files for writing\n", *argv);
+    exit(EXIT_FAILURE);
+  }
+#endif /* ALLPLOTS */
+  for (m = 0, maxypferr = maxycferr = maxycaylerr = 0.0; m <= plotpts; m++) {
+    x = 2.4 * (float) m / plotpts - 1.2;
+    if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
+      /* skip x = 0 for type 1, as R(0) is singular */
+      y = zolotarev_eval((PRECISION) x, rdata);
+      fprintf(plot_function, "%g %g\n", x, (float) y);
+      fprintf(plot_error, "%g %g\n",
+	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
+      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
+		       / rdata -> Delta);
+      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
+		       / rdata -> Delta);
+      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
+		       / rdata -> Delta);
+      /* Only points with epsilon < |x| < 1 count towards the maxima */
+      if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
+	maxypferr = MAX(maxypferr, fabs(ypferr));
+	maxycferr = MAX(maxycferr, fabs(ycferr));
+	maxycaylerr = MAX(maxycaylerr, fabs(ycaylerr));
+      }
+#ifdef ALLPLOTS
+      fprintf(plot_partfrac, "%g %g\n", x, ypferr);
+      fprintf(plot_contfrac, "%g %g\n", x, ycferr);
+      fprintf(plot_cayley, "%g %g\n", x, ycaylerr);
+#endif /* ALLPLOTS */
+    }
+  }
+#ifdef ALLPLOTS
+  fclose(plot_cayley);
+  fclose(plot_contfrac);
+  fclose(plot_partfrac);
+#endif /* ALLPLOTS */
+  fclose(plot_error);
+  fclose(plot_function);
+
+  printf("\n\tMaximum PF error = %g (relative to Delta)\n", maxypferr);
+  printf("\tMaximum CF error = %g (relative to Delta)\n", maxycferr);
+  printf("\tMaximum Cayley error = %g (relative to Delta)\n", maxycaylerr);
+#endif /* NPLOT */
+
+  free(rdata -> a);
+  free(rdata -> ap);
+  free(rdata -> alpha);
+  free(rdata -> beta);
+  free(rdata -> gamma);
+  free(rdata);
+
+  return EXIT_SUCCESS;
+}
+
+#endif /* TEST */
+
--- a/Grid/algorithms/approx/Zolotarev.h
+++ b/Grid/algorithms/approx/Zolotarev.h
@@ -0,0 +1,88 @@
+/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
+
+#ifdef __cplusplus
+#include <Grid/Namespace.h>
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Approx);
+#endif
+
+#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
+
+#ifndef ZOLOTAREV_INTERNAL
+#ifndef PRECISION
+#define PRECISION double
+#endif
+#define ZPRECISION PRECISION
+#define ZOLOTAREV_DATA zolotarev_data
+#endif
+
+/* This struct contains the coefficients which parameterise an optimal rational
+ * approximation to the signum function.
+ *
+ * The parameterisations used are:
+ *
+ * Factored form for type 0 (R0(0) = 0)
+ *
+ * R0(x) = A * x * prod(x^2 - a[j], j = 0 .. dn-1) / prod(x^2 - ap[j], j = 0
+ * .. dd-1),
+ *
+ * where deg_num = 2*dn + 1 and deg_denom = 2*dd.
+ *
+ * Factored form for type 1 (R1(0) = infinity)
+ *
+ * R1(x) = (A / x) * prod(x^2 - a[j], j = 0 .. dn-1) / prod(x^2 - ap[j], j = 0
+ * .. dd-1),
+ *
+ * where deg_num = 2*dn and deg_denom = 2*dd + 1. 
+ *
+ * Partial fraction form
+ *
+ * R(x) = alpha[da] * x + sum(alpha[j] * x / (x^2 - ap[j]), j = 0 .. da-1)
+ *
+ * where da = dd for type 0 and da = dd + 1 with ap[dd] = 0 for type 1.
+ *
+ * Continued fraction form 
+ *
+ * R(x) = beta[db-1] * x + 1 / (beta[db-2] * x + 1 / (beta[db-3] * x + ...))
+ *
+ * with the final coefficient being beta[0], with db = 2 * dd + 1 for type 0
+ * and db = 2 * dd + 2 for type 1.
+ *
+ * Cayley form (Chiu's domain wall formulation)
+ *
+ * R(x) = (1 - T(x)) / (1 + T(x))
+ *
+ * where T(x) = prod((x - gamma[j]) / (x + gamma[j]), j = 0 .. n-1)
+ */
+
+/* Coefficients of one rational approximation to sgn(x), held in each of the
+ * parameterisations listed in the comment above. */
+typedef struct {
+  ZPRECISION *a;       /* numerator zeros, a[0 .. dn-1] */
+  ZPRECISION *ap;      /* denominator zeros (the poles), ap[0 .. dd-1] */
+  ZPRECISION A;        /* overall multiplicative factor */
+  ZPRECISION *alpha;   /* partial fraction coefficients, alpha[0 .. da-1] */
+  ZPRECISION *beta;    /* continued fraction coefficients, beta[0 .. db-1] */
+  ZPRECISION *gamma;   /* numerator zeros of T in the Cayley form */
+  ZPRECISION Delta;    /* error bound, |R(x) - sgn(x)| <= Delta */
+  ZPRECISION epsilon;  /* lower end of the range, epsilon < |x| < 1 */
+  int n;               /* degree of the approximation */
+  int type;            /* 0: R(0) = 0, 1: R(0) = infinity */
+  int dn;              /* number of elements of a */
+  int dd;              /* number of elements of ap */
+  int da;              /* number of elements of alpha */
+  int db;              /* number of elements of beta */
+  int deg_num;         /* numerator degree = deg_denom +/- 1 */
+  int deg_denom;       /* denominator degree */
+} ZOLOTAREV_DATA;
+
+#ifndef ZOLOTAREV_INTERNAL
+
+/* zolotarev(epsilon, n, type) returns a pointer to an initialised
+ * zolotarev_data structure. The arguments must satisfy the constraints that
+ * epsilon > 0, n > 0, and type = 0 or 1.
+ *
+ * higham(epsilon, n) also returns a pointer to a zolotarev_data structure.
+ * NOTE(review): its construction is not visible from this header --
+ * presumably a different approximation to sgn(x) filling the same
+ * coefficient tables; confirm against the definition.
+ *
+ * zolotarev_free(zdata) takes a pointer to such a structure; presumably it
+ * releases the coefficient arrays and the structure itself -- TODO confirm
+ * against the definition. */
+
+ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
+void zolotarev_free(zolotarev_data *zdata);
+#endif
+
+#ifdef __cplusplus
+NAMESPACE_END(Approx);
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/approx/bigfloat.h
+++ b/Grid/algorithms/approx/bigfloat.h
@@ -0,0 +1,206 @@
+/*
+  Mike Clark - 25th May 2005
+
+  bigfloat.h
+
+  Simple C++ wrapper for multiprecision datatype used by AlgRemez
+  algorithm
+*/
+
+#ifndef INCLUDED_BIGFLOAT_H
+#define INCLUDED_BIGFLOAT_H
+
+
+#include <gmp.h>
+#include <mpf2mpfr.h>
+#include <mpfr.h>
+/* C++ value wrapper around a multiprecision float (mpf_t), giving it
+ * constructors, assignment, arithmetic operators and a few math helpers.
+ * Every object owns its mpf_t: each constructor initialises it and the
+ * destructor clears it. */
+class bigfloat {
+
+private:
+
+  mpf_t x;   /* the wrapped multiprecision value */
+
+public:
+
+  /* Constructors: initialise x and, where an argument is given, set it. */
+  bigfloat() { mpf_init(x); }
+  bigfloat(const bigfloat& y) { mpf_init_set(x, y.x); }
+  bigfloat(const unsigned long u) { mpf_init_set_ui(x, u); }
+  bigfloat(const long i) { mpf_init_set_si(x, i); }
+  bigfloat(const int i) {mpf_init_set_si(x,(long)i);}
+  bigfloat(const float d) { mpf_init_set_d(x, (double)d); }
+  bigfloat(const double d) { mpf_init_set_d(x, d); }  
+  /* str is parsed in base 10.  The library call takes a const char*, so the
+   * pointer is passed through without casting const away. */
+  bigfloat(const char *str) { mpf_init_set_str(x, str, 10); }
+  ~bigfloat(void) { mpf_clear(x); }
+
+  /* Conversion to double, as produced by mpf_get_d. */
+  operator double (void) const { return (double)mpf_get_d(x); }
+
+  /* Set the default precision used for values initialised afterwards.
+   * dprec is scaled by 3.321928094 (~ log2(10)) and truncated before being
+   * handed to mpf_set_default_prec, i.e. it is treated as decimal digits. */
+  static void setDefaultPrecision(unsigned long dprec) {
+    unsigned long bprec =  (unsigned long)(3.321928094 * (double)dprec);
+    mpf_set_default_prec(bprec);
+  }
+
+  /* Same conversion as setDefaultPrecision, applied to this value only. */
+  void setPrecision(unsigned long dprec) {
+    unsigned long bprec =  (unsigned long)(3.321928094 * (double)dprec);
+    mpf_set_prec(x,bprec);
+  }
+  
+  /* Precision exactly as reported by mpf_get_prec / mpf_get_default_prec;
+   * the value is not converted back through the factor used by the setters. */
+  unsigned long getPrecision(void) const { return mpf_get_prec(x); }
+
+  unsigned long getDefaultPrecision(void) const { return mpf_get_default_prec(); }
+
+  /* Assignment: x is already initialised, so only its value is replaced. */
+  bigfloat& operator=(const bigfloat& y) {
+    mpf_set(x, y.x); 
+    return *this;
+  }
+
+  bigfloat& operator=(const unsigned long y) { 
+    mpf_set_ui(x, y);
+    return *this; 
+  }
+  
+  bigfloat& operator=(const signed long y) {
+    mpf_set_si(x, y); 
+    return *this;
+  }
+  
+  bigfloat& operator=(const float y) {
+    mpf_set_d(x, (double)y); 
+    return *this;
+  }
+
+  bigfloat& operator=(const double y) {
+    mpf_set_d(x, y); 
+    return *this;
+  }
+
+  /* Declared here, defined elsewhere. */
+  size_t write(void);
+  size_t read(void);
+
+  /* Arithmetic Functions */
+
+  /* Compound assignment is expressed through the binary operators below. */
+  bigfloat& operator+=(const bigfloat& y) { return *this = *this + y; }
+  bigfloat& operator-=(const bigfloat& y) { return *this = *this - y; }
+  bigfloat& operator*=(const bigfloat& y) { return *this = *this * y; }
+  bigfloat& operator/=(const bigfloat& y) { return *this = *this / y; }
+
+  /* Each binary operator computes into a freshly initialised result. */
+  friend bigfloat operator+(const bigfloat& x, const bigfloat& y) {
+    bigfloat a;
+    mpf_add(a.x,x.x,y.x);
+    return a;
+  }
+
+  friend bigfloat operator+(const bigfloat& x, const unsigned long y) {
+    bigfloat a;
+    mpf_add_ui(a.x,x.x,y);
+    return a;
+  }
+
+  friend bigfloat operator-(const bigfloat& x, const bigfloat& y) {
+    bigfloat a;
+    mpf_sub(a.x,x.x,y.x);
+    return a;
+  }
+  
+  friend bigfloat operator-(const unsigned long x, const bigfloat& y) {
+    bigfloat a;
+    mpf_ui_sub(a.x,x,y.x);
+    return a;
+  }
+  
+  friend bigfloat operator-(const bigfloat& x, const unsigned long y) {
+    bigfloat a;
+    mpf_sub_ui(a.x,x.x,y);
+    return a;
+  }
+
+  /* Unary minus. */
+  friend bigfloat operator-(const bigfloat& x) {
+    bigfloat a;
+    mpf_neg(a.x,x.x);
+    return a;
+  }
+
+  friend bigfloat operator*(const bigfloat& x, const bigfloat& y) {
+    bigfloat a;
+    mpf_mul(a.x,x.x,y.x);
+    return a;
+  }
+
+  friend bigfloat operator*(const bigfloat& x, const unsigned long y) {
+    bigfloat a;
+    mpf_mul_ui(a.x,x.x,y);
+    return a;
+  }
+
+  friend bigfloat operator/(const bigfloat& x, const bigfloat& y){
+    bigfloat a;
+    mpf_div(a.x,x.x,y.x);
+    return a;
+  }
+
+  friend bigfloat operator/(const unsigned long x, const bigfloat& y){
+    bigfloat a;
+    mpf_ui_div(a.x,x,y.x);
+    return a;
+  }
+
+  friend bigfloat operator/(const bigfloat& x, const unsigned long y){
+    bigfloat a;
+    mpf_div_ui(a.x,x.x,y);
+    return a;
+  }
+
+  friend bigfloat sqrt_bf(const bigfloat& x){
+    bigfloat a;
+    mpf_sqrt(a.x,x.x);
+    return a;
+  }
+
+  friend bigfloat sqrt_bf(const unsigned long x){
+    bigfloat a;
+    mpf_sqrt_ui(a.x,x);
+    return a;
+  }
+
+  friend bigfloat abs_bf(const bigfloat& x){
+    bigfloat a;
+    mpf_abs(a.x,x.x);
+    return a;
+  }
+
+  /* a raised to an integer power.  mpf_pow_ui only accepts an unsigned
+   * exponent, so a negative power is evaluated as 1 / a^|power| instead of
+   * being reinterpreted as a huge unsigned value. */
+  friend bigfloat pow_bf(const bigfloat& a, long power) {
+    bigfloat b;
+    if (power < 0) {
+      /* 0UL - (unsigned long)power is |power| and cannot overflow */
+      mpf_pow_ui(b.x,a.x,0UL - (unsigned long)power);
+      mpf_ui_div(b.x,1UL,b.x);
+    } else {
+      mpf_pow_ui(b.x,a.x,(unsigned long)power);
+    }
+    return b;
+  }
+
+  /* a raised to a multiprecision power.  The exponent is taken by const
+   * reference so that temporaries and const values select this overload;
+   * with a non-const reference they were converted through operator double
+   * to the integer overload above and silently truncated. */
+  friend bigfloat pow_bf(const bigfloat& a, const bigfloat &power) {
+    bigfloat b;
+    mpfr_pow(b.x,a.x,power.x,GMP_RNDN);
+    return b;
+  }
+
+  friend bigfloat exp_bf(const bigfloat& a) {
+    bigfloat b;
+    mpfr_exp(b.x,a.x,GMP_RNDN);
+    return b;
+  }
+
+  /* Comparison Functions */
+
+  /* Both comparisons return 1 when the relation holds and 0 otherwise. */
+  friend int operator>(const bigfloat& x, const bigfloat& y) {
+    int test;
+    test = mpf_cmp(x.x,y.x);
+    if (test > 0) return 1;
+    else return 0;
+  }
+
+  friend int operator<(const bigfloat& x, const bigfloat& y) {
+    int test;
+    test = mpf_cmp(x.x,y.x);
+    if (test < 0) return 1;
+    else return 0;
+  }
+
+  /* Declared here, defined elsewhere. */
+  friend int sgn(const bigfloat&);
+
+};
+
+#endif
--- a/Grid/algorithms/approx/bigfloat_double.h
+++ b/Grid/algorithms/approx/bigfloat_double.h
@@ -0,0 +1,189 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/bigfloat_double.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <math.h>
+#include <string>
+
+typedef double mfloat; 
+/* Plain double implementation of bigfloat.  It exposes the same members as
+ * the mpf_t based class in bigfloat.h; precision requests are accepted and
+ * ignored, and the reported precision is always 64. */
+class bigfloat {
+private:
+
+  mfloat x;   /* the wrapped value */
+
+public:
+
+  /* Zero initialised, as mpf_init does for the multiprecision class; the
+   * value used to be left indeterminate here. */
+  bigfloat() : x(0) { }
+  bigfloat(const bigfloat& y) : x(y.x) { }
+  bigfloat(const unsigned long u) : x(u) { }
+  bigfloat(const long i) : x(i) { }
+  bigfloat(const int i) : x(i) { }
+  bigfloat(const float d) : x(d) { }
+  bigfloat(const double d) : x(d) { }
+  /* Parsed with std::stod, which throws if str holds no number. */
+  bigfloat(const char *str) : x(std::stod(std::string(str))) { }
+  ~bigfloat(void) { }
+  operator double (void) const { return (double)x; }
+
+  /* Precision is fixed for a double: the setters do nothing. */
+  static void setDefaultPrecision(unsigned long dprec) {
+  }
+
+  void setPrecision(unsigned long dprec) {
+  }
+  
+  unsigned long getPrecision(void) const { return 64; }
+  unsigned long getDefaultPrecision(void) const { return 64; }
+
+  bigfloat& operator=(const bigfloat& y)     { x=y.x;    return *this;  }
+  bigfloat& operator=(const unsigned long y) { x=y; return *this; }
+  bigfloat& operator=(const signed long y)   { x=y; return *this; }
+  bigfloat& operator=(const float y)    { x=y; return *this; }
+  bigfloat& operator=(const double y)   { x=y; return *this; }
+
+  /* Declared here, defined elsewhere. */
+  size_t write(void);
+  size_t read(void);
+
+  /* Arithmetic Functions */
+
+  bigfloat& operator+=(const bigfloat& y) { x += y.x; return *this; }
+  bigfloat& operator-=(const bigfloat& y) { x -= y.x; return *this; }
+  bigfloat& operator*=(const bigfloat& y) { x *= y.x; return *this; }
+  bigfloat& operator/=(const bigfloat& y) { x /= y.x; return *this; }
+
+  friend bigfloat operator+(const bigfloat& x, const bigfloat& y) { 
+    return bigfloat(x.x+y.x);
+  }
+
+  friend bigfloat operator+(const bigfloat& x, const unsigned long y) {
+    return bigfloat(x.x+y);
+  }
+
+  friend bigfloat operator-(const bigfloat& x, const bigfloat& y) {
+    return bigfloat(x.x-y.x);
+  }
+  
+  friend bigfloat operator-(const unsigned long x, const bigfloat& y) {
+    bigfloat bx(x);
+    return bx-y;
+  }
+  
+  friend bigfloat operator-(const bigfloat& x, const unsigned long y) {
+    bigfloat by(y);
+    return x-by;
+  }
+
+  /* Unary minus. */
+  friend bigfloat operator-(const bigfloat& x) {
+    return bigfloat(-x.x);
+  }
+
+  friend bigfloat operator*(const bigfloat& x, const bigfloat& y) {
+    return bigfloat(x.x*y.x);
+  }
+
+  friend bigfloat operator*(const bigfloat& x, const unsigned long y) {
+    return bigfloat(x.x*y);
+  }
+
+  friend bigfloat operator/(const bigfloat& x, const bigfloat& y){
+    return bigfloat(x.x/y.x);
+  }
+
+  friend bigfloat operator/(const unsigned long x, const bigfloat& y){
+    bigfloat bx(x);
+    return bx/y;
+  }
+
+  friend bigfloat operator/(const bigfloat& x, const unsigned long y){
+    bigfloat by(y);
+    return x/by;
+  }
+
+  friend bigfloat sqrt_bf(const bigfloat& x){
+    return bigfloat(sqrt(x.x));
+  }
+
+  friend bigfloat sqrt_bf(const unsigned long x){
+    bigfloat a(x);
+    return sqrt_bf(a);
+  }
+
+  friend bigfloat abs_bf(const bigfloat& x){
+    return bigfloat(fabs(x.x));
+  }
+
+  /* a raised to an integer power. */
+  friend bigfloat pow_bf(const bigfloat& a, long power) {
+    bigfloat b;
+    b.x=pow(a.x,power);
+    return b;
+  }
+
+  /* a raised to a bigfloat power.  The exponent is taken by const reference
+   * so that temporaries and const values select this overload; with a
+   * non-const reference they were converted through operator double to the
+   * integer overload above and silently truncated. */
+  friend bigfloat pow_bf(const bigfloat& a, const bigfloat &power) {
+    bigfloat b;
+    b.x=pow(a.x,power.x);
+    return b;
+  }
+
+  friend bigfloat exp_bf(const bigfloat& a) {
+    return bigfloat(exp(a.x));
+  }
+
+  /* Comparison Functions */
+  friend int operator>(const bigfloat& x, const bigfloat& y) {
+    return x.x>y.x;
+  }
+
+  friend int operator<(const bigfloat& x, const bigfloat& y) {
+    return x.x<y.x;
+  }
+
+  /* Returns 1 for x >= 0 and 0 otherwise.
+   * NOTE(review): negative values map to 0, not -1 -- confirm this matches
+   * the sgn() paired with the multiprecision class. */
+  friend int sgn(const bigfloat& x) {
+    if ( x.x>=0 )  return 1;   
+    else return 0;
+  }
+
+  /* Miscellaneous Functions */
+
+  //  friend bigfloat& random(void);
+};
+
+
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@@ -0,0 +1,397 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/AdefGeneric.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
+#define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
+
+  /*
+   * Compared to Tang-2009:  P=Pleft. P^T = PRight Q=MssInv. 
+   * Script A = SolverMatrix 
+   * Script P = Preconditioner
+   *
+   * Deflation methods considered
+   *      -- Solve P A x = P b        [ like Luscher ]
+   * DEF-1        M P A x = M P b     [i.e. left precon]
+   * DEF-2        P^T M A x = P^T M b
+   * ADEF-1       Preconditioner = M P + Q      [ Q + M + M A Q]
+   * ADEF-2       Preconditioner = P^T M + Q
+   * BNN          Preconditioner = P^T M P + Q
+   * BNN2         Preconditioner = M P + P^TM +Q - M P A M 
+   * 
+   * Implement ADEF-2
+   *
+   * Vstart = P^Tx + Qb
+   * M1 = P^TM + Q
+   * M2=M3=1
+   * Vout = x
+   */
+
+// Abstract base for two-level flexible preconditioned CG solvers.
+// operator() holds the iteration shared by all variants; the virtual hooks
+// M1, M2, M3, Vstart and Vout are the places where variants differ.
+//
+// NOTE(review): this class is work in progress and still does not compile:
+//   - HermOp, HermOpAndNorm, PcgM, ProjectToSubspace, PromoteFromSubspace,
+//     ApplyInverse, ApplyInverseCG, PleftProj, PleftMss_proj, mp and
+//     SmootherMirs are used below but not declared in this class (confirm
+//     they are not meant to be inherited);
+//   - operator() calls M1 and M3 with more arguments than the declarations
+//     below accept, and hands the whole mmp vector to HermOp;
+//   - the constructor arguments SmootherLinop and CoarseLinop have no member
+//     to be stored in, and CoarseLinop is taken by value.
+template<class Field, class CoarseField>
+class TwoLevelFlexiblePcg : public LinearFunction<Field>
+{
+ public:
+  int verbose;
+  RealD   Tolerance;        // relative residual target, see rsq in operator()
+  Integer MaxIterations;
+  const int mmax = 5;       // number of history vectors kept by operator()
+  GridBase *grid;           // taken from src.Grid() at the start of operator()
+  GridBase *coarsegrid;     // NOTE(review): never assigned a grid in this class
+
+  LinearOperatorBase<Field>   *_Linop;
+  OperatorFunction<Field>     *_Smoother;
+  LinearFunction<CoarseField> *_CoarseSolver;
+
+  // Need something that knows how to get from Coarse to fine and back again
+  
+  // more most operator functions
+  //
+  // Stores the tolerance, iteration limit, operator and smoother.
+  // NOTE(review): SmootherLinop and CoarseLinop are accepted but not stored;
+  // _CoarseSolver is left null until the coarse solver is wired in.
+  TwoLevelFlexiblePcg(RealD tol,
+                      Integer maxit,
+                      LinearOperatorBase<Field> *Linop,
+                      LinearOperatorBase<Field> *SmootherLinop,
+                      OperatorFunction<Field>   *Smoother,
+                      OperatorFunction<CoarseField>  CoarseLinop
+                      ) : 
+      Tolerance(tol), 
+      MaxIterations(maxit),
+      grid(nullptr),
+      coarsegrid(nullptr),
+      _Linop(Linop),
+      _Smoother(Smoother),
+      _CoarseSolver(nullptr)
+  { 
+    verbose=0;
+  };
+
+  // The Pcg routine is common to all, but the various matrices differ from derived 
+  // implementation to derived implementation.
+  //
+  // src is the right hand side.  psi takes the checkerboard of src and
+  // receives the solution once the stopping condition is met.  If the
+  // condition is never met the routine ends in assert(0).
+  void operator() (const Field &src, Field &psi){
+
+    psi.Checkerboard() = src.Checkerboard();
+    grid             = src.Grid();
+
+    RealD rtzp,rtz,a,d,b;
+    RealD guess = norm2(psi);
+    RealD ssq   = norm2(src);
+    RealD rsq   = ssq*Tolerance*Tolerance;   // squared residual target
+    
+    /////////////////////////////
+    // Set up history vectors
+    /////////////////////////////
+    std::vector<Field> p  (mmax,grid);
+    std::vector<Field> mmp(mmax,grid);
+    std::vector<RealD> pAp(mmax);
+
+    Field x  (grid); x = psi;
+    Field z  (grid);
+    Field tmp(grid);
+    Field r  (grid);
+    Field mu (grid);
+  
+    //////////////////////////
+    // x0 = Vstart -- possibly modify guess
+    //////////////////////////
+    // NOTE(review): this overwrites the guess copied from psi just above.
+    x=src;
+    Vstart(x,src);
+
+    // r0 = b -A x0
+    HermOp(x,mmp); // Shouldn't this be something else?
+    axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
+
+    //////////////////////////////////
+    // Compute z = M1 x
+    //////////////////////////////////
+    M1(r,z,tmp,mp,SmootherMirs);
+    rtzp =real(innerProduct(r,z));
+
+    ///////////////////////////////////////
+    // Solve for Mss mu = P A z and set p = z-mu
+    // Def2: p = 1 - Q Az = Pright z 
+    // Other algos M2 is trivial
+    ///////////////////////////////////////
+    M2(z,p[0]);
+
+    for (int k=0;k<=MaxIterations;k++){
+    
+      // History vectors are used cyclically, mmax slots in total
+      int peri_k  = k % mmax;
+      int peri_kp = (k+1) % mmax;
+
+      rtz=rtzp;
+      d= M3(p[peri_k],mp,mmp[peri_k],tmp);
+      a = rtz/d;
+    
+      // Memorise this
+      pAp[peri_k] = d;
+
+      axpy(x,a,p[peri_k],x);
+      RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
+
+      // Compute z = M x
+      M1(r,z,tmp,mp);
+
+      rtzp =real(innerProduct(r,z));
+
+      M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
+
+      // NOTE(review): the new direction starts from the previous p, not from
+      // mu, and b below is computed but never used -- confirm intent.
+      p[peri_kp]=p[peri_k];
+
+      // Standard search direction  p -> z + b p    ; b = 
+      b = (rtzp)/rtz;
+
+      int northog;
+      //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
+      northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
+    
+      // Orthogonalise the new direction against the stored previous ones
+      for(int back=0; back < northog; back++){
+        int peri_back = (k-back)%mmax;
+        RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
+        RealD beta = -pbApk/pAp[peri_back];
+        axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
+      }
+
+      RealD rrn=sqrt(rn/ssq);
+      std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
+
+      // Stopping condition
+      if ( rn <= rsq ) { 
+
+        HermOp(x,mmp); // Shouldn't this be something else?
+        axpy(tmp,-1.0,src,mmp[0]);
+        
+        RealD psinorm = sqrt(norm2(x));
+        RealD srcnorm = sqrt(norm2(src));
+        RealD tmpnorm = sqrt(norm2(tmp));
+        RealD true_residual = tmpnorm/srcnorm;
+        std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl;
+        std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
+
+        // Hand the solution back (Vout = x); the function returns void, so
+        // the iteration count is not returned.
+        psi = x;
+        return;
+      }
+    }
+    // Non-convergence
+    assert(0);
+  }
+
+ public:
+
+  // Empty hook.
+  virtual void M(Field & in,Field & out,Field & tmp) {
+
+  }
+
+  // Preconditioner application: out = Min + Q [ in - A Min ], where Min is
+  // the smoother applied to in.
+  virtual void M1(Field & in, Field & out) {// the smoother
+
+    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
+    Field tmp(grid);
+    Field Min(grid);
+
+    PcgM(in,Min); // Smoother call
+
+    HermOp(Min,out);
+    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min
+
+    ProjectToSubspace(tmp,PleftProj);     
+    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
+    PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+    axpy(out,1.0,Min,tmp); // Min+tmp
+  }
+
+  // Identity by default.
+  virtual void M2(const Field & in, Field & out) {
+    out=in;
+    // Must override for Def2 only
+    //  case PcgDef2:
+    //    Pright(in,out);
+    //    break;
+  }
+
+  // Applies the operator to p, leaving the result in mmp, and returns the
+  // second of the two values produced by HermOpAndNorm.
+  virtual RealD M3(const Field & p, Field & mmp){
+    double d,dd;
+    HermOpAndNorm(p,mmp,d,dd);
+    return dd;
+    // Must override for Def1 only
+    //  case PcgDef1:
+    //    d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
+    //      linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
+    //    Pleft(mp,mmp);
+    //    d=real(linop_d->inner(p,mmp));
+  }
+
+  // Replaces x by x + Ass^{-1} [src - A x]_s, see the derivation below.
+  virtual void VstartDef2(Field & x,const Field & src){
+    //case PcgDef2:
+    //case PcgAdef2: 
+    //case PcgAdef2f:
+    //case PcgV11f:
+    ///////////////////////////////////
+    // Choose x_0 such that 
+    // x_0 = guess +  (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
+    //                               = [1 - Ass_inv A] Guess + Assinv src
+    //                               = P^T guess + Assinv src 
+    //                               = Vstart  [Tang notation]
+    // This gives:
+    // W^T (src - A x_0) = src_s - A guess_s - r_s
+    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
+    //                   = 0 
+    ///////////////////////////////////
+    Field r(grid);
+    Field mmp(grid);
+    
+    HermOp(x,mmp);
+    axpy (r, -1.0, mmp, src);        // r_{-1} = src - A x
+    ProjectToSubspace(r,PleftProj);     
+    ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    PromoteFromSubspace(PleftMss_proj,mmp);  
+    x=x+mmp;
+
+  }
+
+  // Default start vector hook: leaves x untouched.
+  virtual void Vstart(Field & x,const Field & src){
+    return;
+  }
+
+  /////////////////////////////////////////////////////////////////////
+  // Only Def1 has non-trivial Vout. Override in Def1
+  /////////////////////////////////////////////////////////////////////
+  virtual void   Vout  (Field & in, Field & out,Field & src){
+    out = in;
+    //case PcgDef1:
+    //    //Qb + PT x
+    //    ProjectToSubspace(src,PleftProj);     
+    //    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    //    PromoteFromSubspace(PleftMss_proj,tmp);  
+    //    
+    //    Pright(in,out);
+    //    
+    //    linop_d->axpy(out,tmp,out,1.0);
+    //    break;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  // Pright and Pleft are common to all implementations
+  ////////////////////////////////////////////////////////////////////////////////////////////////
+  virtual void Pright(Field & in,Field & out){
+    // P_R  = [ 1              0 ] 
+    //        [ -Mss^-1 Msb    0 ] 
+    Field in_sbar(grid);
+
+    ProjectToSubspace(in,PleftProj);     
+    PromoteFromSubspace(PleftProj,out);  
+    axpy(in_sbar,-1.0,out,in);       // in_sbar = in - in_s 
+
+    HermOp(in_sbar,out);
+    ProjectToSubspace(out,PleftProj);           // Mssbar in_sbar  (project)
+
+    ApplyInverse     (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar 
+    PromoteFromSubspace(PleftMss_proj,out);     // 
+
+    axpy(out,-1.0,out,in_sbar);     // in_sbar - Mss^{-1} Mssbar in_sbar
+  }
+  virtual void Pleft (Field & in,Field & out){
+    // P_L  = [ 1  -Mbs Mss^-1] 
+    //        [ 0   0         ] 
+    Field in_sbar(grid);
+    Field    tmp2(grid);
+    Field    Mtmp(grid);
+
+    ProjectToSubspace(in,PleftProj);     
+    PromoteFromSubspace(PleftProj,out);  
+    axpy(in_sbar,-1.0,out,in);      // in_sbar = in - in_s
+
+    ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
+    PromoteFromSubspace(PleftMss_proj,out);
+
+    HermOp(out,Mtmp);
+
+    ProjectToSubspace(Mtmp,PleftProj);      // Msbar s Mss^{-1}
+    PromoteFromSubspace(PleftProj,tmp2);
+
+    axpy(out,-1.0,tmp2,Mtmp);
+    axpy(out,-1.0,out,in_sbar);     // in_sbar - Msbars Mss^{-1} in_s
+  }
+};
+
+// Placeholder for the ADEF-2 variant (going by the class name): every hook
+// is still an empty stub.
+// NOTE(review): TwoLevelFlexiblePcg is declared with two template parameters
+// but is named here with one, and these hooks take more arguments than the
+// base class virtuals of the same name, so they hide rather than override
+// them -- both need resolving before this class can be used.
+template<class Field>
+class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp){
+
+  } 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
+
+  }
+  virtual void M2(Field & in, Field & out){
+
+  }
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
+    // Not implemented: fail loudly instead of flowing off the end of a
+    // function that must return a value.
+    assert(0);
+    return 0.0;
+  }
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
+
+  }
+};
+/*
+template<class Field>
+class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+  virtual void   Vout  (Field & in, Field & out,Field & src,Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+
+template<class Field>
+class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
+ public:
+  virtual void M(Field & in,Field & out,Field & tmp); 
+  virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
+  virtual void M2(Field & in, Field & out);
+  virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
+  virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
+}
+*/
+#endif
--- a/Grid/algorithms/iterative/BlockConjugateGradient.h
+++ b/Grid/algorithms/iterative/BlockConjugateGradient.h
@@ -0,0 +1,694 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
+
+Copyright (C) 2017
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Selects the solver variant run by BlockConjugateGradient::operator().
+// In this file BlockCGrQ dispatches to BlockCGrQsolve, CGmultiRHS to
+// CGmultiRHSsolve and BlockCGrQVec to BlockCGrQsolveVec; BlockCG and
+// BlockCGVec have no implementation here and reach assert(0).
+enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
+
+//////////////////////////////////////////////////////////////////////////
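+// Block conjugate gradient. The block direction is the lattice dimension
+// passed to the constructor as _Orthog (stored in blockDim); dimension zero
+// is the expected choice.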
+//////////////////////////////////////////////////////////////////////////
+template <class Field>
+class BlockConjugateGradient : public OperatorFunction<Field> {
+ public:
+
+  typedef typename Field::scalar_type scomplex;
+
+  int blockDim ;
+  int Nblock;
+
+  BlockCGtype CGtype;
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+                           // Defaults true.
+  RealD Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer PrintInterval; //GridLogMessages or Iterative
+  
+  // Construct a block solver.
+  //   cgtype         : solver variant that operator() dispatches to
+  //   _Orthog        : lattice dimension used as the block (right-hand-side) index
+  //   tol            : residual target; the solvers compare squared relative
+  //                    residuals against tol*tol
+  //   maxit          : maximum number of iterations
+  //   err_on_no_conv : when true, assert(0) if maxit is exhausted without convergence
+  // Members are initialised in declaration order. Nblock and
+  // IterationsToComplete start at zero so they hold a defined value before
+  // the first solve; PrintInterval defaults to 100.
+  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
+    : blockDim(_Orthog),
+      Nblock(0),
+      CGtype(cgtype),
+      ErrorOnNoConverge(err_on_no_conv),
+      Tolerance(tol),
+      MaxIterations(maxit),
+      IterationsToComplete(0),
+      PrintInterval(100)
+  {}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Thin QR factorisation (google it)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  //Dimensions
+  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
+  //
+  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
+  //
+  //   Q  C = R => Q = R C^{-1}
+  //
+  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
+  //
+  // Set C = L^{dag}, and then Q^dag Q = ident 
+  //
+  // Checks:
+  // Cdag C = Rdag R ; passes.
+  // QdagQ  = 1      ; passes
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+// Thin QR factorisation of the block vector R along lattice dimension blockDim.
+//   m_rr : out, Gram matrix R^dag R, made explicitly Hermitian
+//   C    : out, L^dag where m_rr = L L^dag is the Cholesky factorisation
+//   Cinv : out, inverse of C
+//   Q    : out, R C^{-1}
+//   R    : in,  block vector to factorise
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+                 Eigen::MatrixXcd &C,
+                 Eigen::MatrixXcd &Cinv,
+                 Field & Q,
+                 const Field & R)
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
+  sliceInnerProductMatrix(m_rr,R,R,Orthog);
+
+  // Force manifest hermitian to avoid rounding related issues.
+  // m_rr is both source and destination and entry (i,j) of the result reads
+  // entry (j,i) of m_rr, so the right-hand side is evaluated into a temporary
+  // before assignment; a lazy coefficient-wise assignment could otherwise
+  // read entries that have already been overwritten.
+  m_rr = (0.5*(m_rr+m_rr.adjoint())).eval();
+
+  Eigen::MatrixXcd L    = m_rr.llt().matrixL();
+
+  C    = L.adjoint();
+  Cinv = C.inverse();
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Q = R C^{-1}
+  //
+  // Q_j  = R_i Cinv(i,j)
+  //
+  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  sliceMulMatrix(Q,Cinv,R,Orthog);
+}
+// see comments above
+// Thin QR factorisation of a block of Nblock separate fields.
+//   m_rr : out, Gram matrix of inner products of R with itself, made
+//          explicitly Hermitian
+//   C    : out, L^dag where m_rr = L L^dag is the Cholesky factorisation
+//   Cinv : out, inverse of C
+//   Q    : out, Q[j] = sum_i Cinv(i,j) R[i] (computed by MulMatrix)
+//   R    : in,  fields to factorise
+void ThinQRfact (Eigen::MatrixXcd &m_rr,
+                 Eigen::MatrixXcd &C,
+                 Eigen::MatrixXcd &Cinv,
+                 std::vector<Field> & Q,
+                 const std::vector<Field> & R)
+{
+  InnerProductMatrix(m_rr,R,R);
+
+  // Symmetrise via a temporary: entry (i,j) of the result reads entry (j,i)
+  // of m_rr, so assigning the lazy expression straight back into m_rr could
+  // read entries that have already been overwritten.
+  m_rr = (0.5*(m_rr+m_rr.adjoint())).eval();
+
+  Eigen::MatrixXcd L    = m_rr.llt().matrixL();
+
+  C    = L.adjoint();
+  Cinv = C.inverse();
+
+  MulMatrix(Q,Cinv,R);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Call one of several implementations
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Solve with the single-Field solver selected by CGtype.
+// BlockCGrQ and CGmultiRHS are handled; every other type reaches assert(0).
+void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+{
+  switch (CGtype) {
+  case BlockCGrQ:
+    BlockCGrQsolve(Linop,Src,Psi);
+    break;
+  case CGmultiRHS:
+    CGmultiRHSsolve(Linop,Src,Psi);
+    break;
+  default:
+    assert(0);
+    break;
+  }
+}
+// Solve for a vector of separate fields. Only BlockCGrQVec is implemented;
+// every other CGtype reaches assert(0).
+virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) 
+{
+  const bool supported = ( CGtype == BlockCGrQVec );
+  if ( !supported ) {
+    assert(0);
+    return;
+  }
+  BlockCGrQsolveVec(Linop,Src,Psi);
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQ implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+// Block conjugate gradient, rQ variant, for right-hand sides packed along
+// lattice dimension blockDim of a single Field.
+//   Linop : operator to invert; only Linop.HermOp is applied
+//   B     : right-hand sides
+//   X     : initial guess on entry, solution on exit
+// Nblock is read from the grid's _fdimensions entry for the block direction.
+// Convergence test: max over blocks of (C^dag C)(b,b)/ssq[b] < Tolerance^2,
+// where ssq is the per-block sliceNorm of B.
+// IterationsToComplete is set on both the converged and unconverged exits;
+// without convergence the routine asserts when ErrorOnNoConverge is set.
+void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
+  Nblock = B.Grid()->_fdimensions[Orthog];
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
+
+  X.Checkerboard() = B.Checkerboard();
+  conformable(X, B);
+
+  Field tmp(B);
+  Field Q(B);
+  Field D(B);
+  Field Z(B);
+  Field AD(B);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  sliceNorm(ssq,B,Orthog);
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  // NaN guard on the source; ssq already holds the per-block sliceNorm of B
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(ssq[b])==0); }
+
+  // NaN guard on the initial guess
+  sliceNorm(residuals,X,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+  Linop.HermOp(X, AD);
+  tmp = B - AD;  
+
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  D=Q;
+
+  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    Linop.HermOp(D, Z);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+    
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    // The diagonal of C^dag C is used as the per-block residual estimate
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
+              <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
+
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+        std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
+                  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      // Recompute the residual from scratch as a cross-check on the estimate
+      Linop.HermOp(X, AD);
+      AD = AD-B;
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+//////////////////////////////////////////////////////////////////////////
+// multiRHS conjugate gradient. Dimension zero should be the block direction
+// Use this for spread out across nodes
+//////////////////////////////////////////////////////////////////////////
+// Multiple right-hand-side conjugate gradient for sources packed along
+// lattice dimension blockDim of a single Field. Each block carries its own
+// alpha/beta coefficients; the operator is applied once per iteration to the
+// whole Field.
+//   Linop : operator to invert; only Linop.HermOp is applied
+//   Src   : right-hand sides
+//   Psi   : initial guess on entry, solution on exit
+// Convergence test: max over blocks of v_rr[b]/ssq[b] < Tolerance^2, where
+// ssq is the per-block sliceNorm of Src and v_rr that of the residual.
+// IterationsToComplete is set on both exits; without convergence the routine
+// asserts when ErrorOnNoConverge is set.
+void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+{
+  int Orthog = blockDim; // First dimension is block dim
+  Nblock = Src.Grid()->_fdimensions[Orthog];
+
+  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
+
+  Psi.Checkerboard() = Src.Checkerboard();
+  conformable(Psi, Src);
+
+  Field P(Src);
+  Field AP(Src);
+  Field R(Src);
+  
+  std::vector<ComplexD> v_pAp(Nblock);
+  std::vector<RealD> v_rr (Nblock);
+  std::vector<RealD> v_rr_inv(Nblock);
+  std::vector<RealD> v_alpha(Nblock);
+  std::vector<RealD> v_beta(Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  sliceNorm(ssq,Src,Orthog);
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  // NaN guard on the source; ssq already holds the per-block sliceNorm of Src
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(ssq[b])==0); }
+
+  // NaN guard on the initial guess
+  sliceNorm(residuals,Psi,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  // Initial search dir is guess
+  Linop.HermOp(Psi, AP);
+
+  R = Src - AP;  
+  P = R;
+  sliceNorm(v_rr,R,Orthog);
+
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch sliceNormTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+
+  SolverTimer.Start();
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    RealD rrsum=0;
+    for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
+
+    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
+              <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
+
+    MatrixTimer.Start();
+    Linop.HermOp(P, AP);
+    MatrixTimer.Stop();
+
+    // Alpha
+    sliceInnerTimer.Start();
+    sliceInnerProductVector(v_pAp,P,AP,Orthog);
+    sliceInnerTimer.Stop();
+    for(int b=0;b<Nblock;b++){
+      v_alpha[b] = v_rr[b]/real(v_pAp[b]);
+    }
+
+    // Psi, R update
+    sliceMaddTimer.Start();
+    sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi
+    sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
+    sliceMaddTimer.Stop();
+
+    // Beta: ratio of new to old per-block residual norm; the old value is
+    // inverted before v_rr is overwritten.
+    for(int b=0;b<Nblock;b++){
+      v_rr_inv[b] = 1.0/v_rr[b];
+    }
+    sliceNormTimer.Start();
+    sliceNorm(v_rr,R,Orthog);
+    sliceNormTimer.Stop();
+    for(int b=0;b<Nblock;b++){
+      v_beta[b] = v_rr_inv[b] *v_rr[b];
+    }
+
+    // Search update
+    sliceMaddTimer.Start();
+    sliceMaddVector(P,v_beta,P,R,Orthog);
+    sliceMaddTimer.Stop();
+
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    RealD max_resid=0;
+    for(int b=0;b<Nblock;b++){
+      RealD rr = v_rr[b]/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+    
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
+      for(int b=0;b<Nblock;b++){
+        std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      // Recompute the residual from scratch as a cross-check on the estimate
+      Linop.HermOp(Psi, AP);
+      AP = AP-Src;
+      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+
+
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+
+// Fill m(row,col) with innerProduct(X[row],Y[col]) for all row,col < Nblock.
+// Uses the member Nblock, so it must be set before this is called.
+void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
+  for(int row=0; row<Nblock; row++){
+    for(int col=0; col<Nblock; col++){
+      m(row,col) = innerProduct(X[row],Y[col]);
+    }
+  }
+}
+// AP[b] = Y[b] + scale * sum_bp m(bp,b) X[bp]   for b < Nblock.
+// The result is accumulated in scratch fields and copied out afterwards so
+// that AP may alias X or Y.
+void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
+  // Should make this cache friendly with site outermost, parallel_for
+  std::vector<Field> accum(Nblock,X[0]);
+  for(int col=0; col<Nblock; col++){
+    accum[col] = Y[col];
+    for(int row=0; row<Nblock; row++){
+      accum[col] = accum[col] + (scale*m(row,col))*X[row];
+    }
+  }
+  for(int col=0; col<Nblock; col++){
+    AP[col] = accum[col];
+  }
+}
+// AP[b] = sum_bp m(bp,b) X[bp]   for b < Nblock.
+// AP[b] is zeroed and accumulated in place, so no scratch copy is made.
+void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
+  // Should make this cache friendly with site outermost, parallel_for
+  for(int col=0; col<Nblock; col++){
+    AP[col] = Zero();
+    for(int row=0; row<Nblock; row++){
+      AP[col] += (m(row,col))*X[row];
+    }
+  }
+}
+// Sum of norm2 over the first Nblock fields of P.
+double normv(const std::vector<Field> &P){
+  double total = 0.0;
+  for(int idx=0; idx<Nblock; idx++){
+    total += norm2(P[idx]);
+  }
+  return total;
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQvec implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+// Block conjugate gradient, rQ variant, for right-hand sides held as a
+// vector of separate fields.
+//   Linop : operator to invert; Linop.HermOp is applied to each field in turn
+//   B     : right-hand sides; must be non-empty
+//   X     : initial guesses on entry, solutions on exit; same size as B
+// Sets Nblock = B.size(). Convergence test: max over blocks of
+// (C^dag C)(b,b)/norm2(B[b]) < Tolerance^2.
+// IterationsToComplete is set on both exits; without convergence the routine
+// asserts when ErrorOnNoConverge is set.
+void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) 
+{
+  // B[0] and X[0] are dereferenced below, so an empty block is rejected here.
+  assert(!B.empty());
+  assert(B.size() == X.size());
+  Nblock = static_cast<int>(B.size());
+
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
+
+  for(int b=0;b<Nblock;b++){ 
+    X[b].Checkerboard() = B[b].Checkerboard();
+    conformable(X[b], B[b]);
+    conformable(X[b], X[0]); 
+  }
+
+  // Template field used only to construct the work vectors below
+  Field Fake(B[0]);
+
+  std::vector<Field> tmp(Nblock,Fake);
+  std::vector<Field>   Q(Nblock,Fake);
+  std::vector<Field>   D(Nblock,Fake);
+  std::vector<Field>   Z(Nblock,Fake);
+  std::vector<Field>  AD(Nblock,Fake);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  // NaN guard on the sources; ssq already holds norm2(B[b])
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(ssq[b])==0); }
+
+  // NaN guard on the initial guesses
+  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+  for(int b=0;b<Nblock;b++) {
+    Linop.HermOp(X[b], AD[b]);
+    tmp[b] = B[b] - AD[b];  
+  }
+
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+
+  for(int b=0;b<Nblock;b++) D[b]=Q[b];
+
+  std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    InnerProductMatrix(m_DZ,D,Z);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+    
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    MaddMatrix(X,m_tmp, D,X);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    MaddMatrix(tmp,m_M,Z,Q,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+    sliceMaddTimer.Start();
+    MaddMatrix(D,m_tmp,D,Q);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    // The diagonal of C^dag C is used as the per-block residual estimate
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
+
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+        std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      // Recompute the residual from scratch as a cross-check on the estimate
+      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
+      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -0,0 +1,180 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/ConjugateGradient.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_H
+#define GRID_CONJUGATE_GRADIENT_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////
+// Base classes for iterative processes based on operators
+// single input vec, single output vec.
+/////////////////////////////////////////////////////////////
+
+// Conjugate gradient solver for a single source vector.
+// operator() solves Linop psi = src, using only Linop.HermOp and
+// Linop.HermOpAndNorm, and stops when |r|^2 <= Tolerance^2 |src|^2 or after
+// MaxIterations iterations.
+template <class Field>
+class ConjugateGradient : public OperatorFunction<Field> {
+public:
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+                           // Defaults true.
+  RealD Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  
+  // Members are initialised in declaration order; IterationsToComplete
+  // starts at zero so it holds a defined value before the first solve.
+  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
+    : ErrorOnNoConverge(err_on_no_conv),
+      Tolerance(tol),
+      MaxIterations(maxit),
+      IterationsToComplete(0) {}
+
+  // Solve Linop psi = src.
+  //   Linop : operator to invert
+  //   src   : right-hand side
+  //   psi   : initial guess on entry, solution on exit
+  // IterationsToComplete is 0 when the guess already meets the target,
+  // otherwise the iteration count at exit. If MaxIterations is exhausted
+  // the routine asserts when ErrorOnNoConverge is set.
+  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
+
+    psi.Checkerboard() = src.Checkerboard();
+
+    conformable(psi, src);
+
+    RealD cp, c, a, d, b, ssq, qq;
+
+    Field p(src);
+    Field mmp(src);
+    Field r(src);
+
+    // Initial residual computation & set up
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+
+    
+    Linop.HermOpAndNorm(psi, mmp, d, b);
+    
+    r = src - mmp;
+    p = r;
+
+    a = norm2(p);
+    cp = a;
+    ssq = norm2(src);
+
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   mmp " << b << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:  cp,r " << cp << std::endl;
+    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:     p " << a << std::endl;
+
+    // Absolute target for the squared residual norm
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    // Check if guess is really REALLY good :)
+    if (cp <= rsq) {
+      IterationsToComplete = 0;
+      return;
+    }
+
+    std::cout << GridLogIterative << std::setprecision(8)
+              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
+
+    GridStopWatch LinalgTimer;
+    GridStopWatch InnerTimer;
+    GridStopWatch AxpyNormTimer;
+    GridStopWatch LinearCombTimer;
+    GridStopWatch MatrixTimer;
+    GridStopWatch SolverTimer;
+
+    SolverTimer.Start();
+    int k;
+    // Honour the configured iteration limit exactly
+    for (k = 1; k <= MaxIterations; k++) {
+      c = cp;
+
+      MatrixTimer.Start();
+      Linop.HermOp(p, mmp);
+      MatrixTimer.Stop();
+
+      LinalgTimer.Start();
+
+      InnerTimer.Start();
+      ComplexD dc  = innerProduct(p,mmp);
+      InnerTimer.Stop();
+      d = dc.real();
+      a = c / d;
+
+      // r <- r - a*mmp, with cp receiving axpy_norm's return value
+      AxpyNormTimer.Start();
+      cp = axpy_norm(r, -a, mmp, r);
+      AxpyNormTimer.Stop();
+      b = cp / c;
+
+      // Fused update: psi <- psi + a*p ; p <- r + b*p
+      LinearCombTimer.Start();
+      auto psi_v = psi.View();
+      auto p_v   = p.View();
+      auto r_v   = r.View();
+      parallel_for(int ss=0;ss<src.Grid()->oSites();ss++){
+        vstream(psi_v[ss], a      *  p_v[ss] + psi_v[ss]);
+        vstream(p_v  [ss], b      *  p_v[ss] + r_v[ss]);
+      }
+      LinearCombTimer.Stop();
+      LinalgTimer.Stop();
+
+      // The value printed is sqrt(cp/ssq), i.e. the relative residual itself
+      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
+                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
+
+      // Stopping condition
+      if (cp <= rsq) {
+        SolverTimer.Stop();
+        // Recompute the residual from scratch as a cross-check
+        Linop.HermOpAndNorm(psi, mmp, d, qq);
+        p = mmp - src;
+
+        RealD srcnorm = std::sqrt(norm2(src));
+        RealD resnorm = std::sqrt(norm2(p));
+        RealD true_residual = resnorm / srcnorm;
+
+        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
+        std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
+        std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
+        std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+
+        std::cout << GridLogMessage << "Time breakdown "<<std::endl;
+        std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
+
+        // Guard against the true residual being far above the target
+        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+
+        IterationsToComplete = k;
+
+        return;
+      }
+    }
+    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
+              << std::endl;
+
+    if (ErrorOnNoConverge) assert(0);
+    IterationsToComplete = k;
+
+  }
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -0,0 +1,156 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+//Mixed precision restarted defect correction CG
+//
+// Each outer iteration ("restart") forms the double precision residual
+// src_d_in - HermOp(sol_d) with Linop_d, converts it to single precision, runs a
+// single precision ConjugateGradient on it with Linop_f (starting from zero, or from
+// the optional guesser's output), converts the result back to double and adds it
+// onto sol_d. The outer loop ends when the squared residual norm falls below
+// OuterLoopNormMult * Tolerance^2 * norm2(src_d_in) or after MaxOuterIterations
+// restarts; a final double precision ConjugateGradient is then always run from the
+// current sol_d.
+//
+// The template is only enabled when getPrecision<FieldD>::value == 2 and
+// getPrecision<FieldF>::value == 1 (presumably double and single precision
+// respectively - confirm against getPrecision).
+template<class FieldD,class FieldF, 
+	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
+public:                                                
+  RealD   Tolerance; //Tolerance of the overall solve; the squared-norm target is Tolerance^2 * norm2(source)
+  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+  Integer MaxInnerIterations; //Iteration limit passed to each inner CG and also to the final double precision CG
+  Integer MaxOuterIterations; //Maximum number of restarts
+  GridBase* SinglePrecGrid; //Grid for single-precision fields
+  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+  LinearOperatorBase<FieldF> &Linop_f;
+  LinearOperatorBase<FieldD> &Linop_d;
+
+  // The three counters below are only assigned inside operator(); they hold no
+  // defined value before the first solve.
+  Integer TotalInnerIterations; //Number of inner CG iterations
+  Integer TotalOuterIterations; //Number of restarts
+  Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+
+  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+  LinearFunction<FieldF> *guesser;
+    
+  // Stores the parameters and references to the two operators. InnerTolerance starts
+  // equal to tol, OuterLoopNormMult starts at 100 and no guesser is installed.
+  // Only references/pointers are kept, so the operators and the single precision
+  // grid must outlive this object.
+  MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
+    Linop_f(_Linop_f), Linop_d(_Linop_d),
+    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+    OuterLoopNormMult(100.), guesser(NULL){ };
+
+  // Install a guesser that is called as (*guesser)(src_f, sol_f) before every inner
+  // solve. Only the address of g is stored.
+  void useGuesser(LinearFunction<FieldF> &g){
+    guesser = &g;
+  }
+  
+  // Run the solve. sol_d is read as the starting solution (it is fed to HermOp on
+  // the first outer iteration) and is updated in place.
+  void operator() (const FieldD &src_d_in, FieldD &sol_d){
+    TotalInnerIterations = 0;
+	
+    GridStopWatch TotalTimer;
+    TotalTimer.Start();
+    
+    int cb = src_d_in.Checkerboard();
+    sol_d.Checkerboard() = cb;
+    
+    // stop is the target for the squared residual norm
+    RealD src_norm = norm2(src_d_in);
+    RealD stop = src_norm * Tolerance*Tolerance;
+
+    GridBase* DoublePrecGrid = src_d_in.Grid();
+    FieldD tmp_d(DoublePrecGrid);
+    tmp_d.Checkerboard() = cb;
+    
+    // NOTE(review): tmp2_d is created here but not used anywhere else in this function
+    FieldD tmp2_d(DoublePrecGrid);
+    tmp2_d.Checkerboard() = cb;
+    
+    FieldD src_d(DoublePrecGrid);
+    src_d = src_d_in; //source for next inner iteration, computed from residual during operation
+    
+    // inner_tol is declared once outside the outer loop and only ever increased
+    // below, so a loosened value carries over to later restarts
+    RealD inner_tol = InnerTolerance;
+    
+    FieldF src_f(SinglePrecGrid);
+    src_f.Checkerboard() = cb;
+    
+    FieldF sol_f(SinglePrecGrid);
+    sol_f.Checkerboard() = cb;
+    
+    // The inner solver must not abort when it stops short of its tolerance: the
+    // outer loop re-evaluates the residual in double precision anyway
+    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+    CG_f.ErrorOnNoConverge = false;
+
+    GridStopWatch InnerCGtimer;
+
+    GridStopWatch PrecChangeTimer;
+    
+    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+      
+    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+      //Compute double precision rsd and also new RHS vector.
+      Linop_d.HermOp(sol_d, tmp_d);
+      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
+      
+      // Both values printed here are squared norms (norm and stop)
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
+
+      if(norm < OuterLoopNormMult * stop){
+	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
+	break;
+      }
+      // Loosen the inner tolerance by doubling until norm * inner_tol^2 >= stop
+      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+
+      PrecChangeTimer.Start();
+      precisionChange(src_f, src_d);
+      PrecChangeTimer.Stop();
+      
+      sol_f = Zero();
+
+      //Optionally improve inner solver guess (eg using known eigenvectors)
+      if(guesser != NULL)
+	(*guesser)(src_f, sol_f);
+
+      //Inner CG
+      CG_f.Tolerance = inner_tol;
+      InnerCGtimer.Start();
+      CG_f(Linop_f, src_f, sol_f);
+      InnerCGtimer.Stop();
+      TotalInnerIterations += CG_f.IterationsToComplete;
+      
+      //Convert sol back to double and add to double prec solution
+      PrecChangeTimer.Start();
+      precisionChange(tmp_d, sol_f);
+      PrecChangeTimer.Stop();
+      
+      axpy(sol_d, 1.0, tmp_d, sol_d);
+    }
+    
+    //Final trial CG
+    // Always executed, whether the outer loop ended by its convergence test or by
+    // reaching MaxOuterIterations. It solves against the original source starting
+    // from the accumulated sol_d, and uses MaxInnerIterations as its iteration limit.
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
+    
+    ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
+    CG_d(Linop_d, src_d_in, sol_d);
+    TotalFinalStepIterations = CG_d.IterationsToComplete;
+
+    TotalTimer.Stop();
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -0,0 +1,322 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
+#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////
+// Base classes for iterative processes based on operators
+// single input vec, single output vec.
+/////////////////////////////////////////////////////////////
+
+// Multi-shift conjugate gradient.
+//
+// For every shift s in [0, shifts.order) this produces psi[s] such that
+// (HermOp + shifts.poles[s]) psi[s] = src, which is the quantity whose true residual
+// is printed on convergence. All shifts share the single Krylov sequence generated
+// for shift 0, so poles[0] must be the smallest pole (asserted below). The
+// per-shift stopping targets come from shifts.tolerances; the Tolerance member is
+// not read anywhere in this class and is not set by the constructor.
+//
+// IterationsToComplete is written on every exit path of the vector-valued solve:
+// the converging iteration on success, or the loop counter after the last
+// iteration when MaxIterations is exhausted (same convention as ConjugateGradient).
+template<class Field> 
+class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
+				    public OperatorFunction<Field>
+{
+public:
+  RealD   Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  int verbose;
+  MultiShiftFunction shifts;
+
+  // maxit   : iteration limit for the shared CG recurrence
+  // _shifts : poles / tolerances / residues / norm; copied into this object
+  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+    MaxIterations(maxit),
+    shifts(_shifts)
+  { 
+    verbose=1;
+  }
+
+  // Single-output form: solves every shift into a temporary array and returns the
+  // combination formed by the four-argument overload in psi.
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
+  {
+    GridBase *grid = src.Grid();
+    int nshift = shifts.order;
+    std::vector<Field> results(nshift,grid);
+    (*this)(Linop,src,results,psi);
+  }
+
+  // Solves every shift into results and then forms
+  //   psi = shifts.norm * src + sum_i shifts.residues[i] * results[i]
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
+  {
+    int nshift = shifts.order;
+
+    (*this)(Linop,src,results);
+  
+    psi = shifts.norm*src;
+    for(int i=0;i<nshift;i++){
+      psi = psi + shifts.residues[i]*results[i];
+    }
+
+    return;
+  }
+
+  // Core solver. psi must already hold shifts.order fields; any previous content is
+  // overwritten (the initial guess is wired to zero).
+  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
+  {
+  
+    GridBase *grid = src.Grid();
+  
+    ////////////////////////////////////////////////////////////////////////
+    // Convenience references to the info stored in "MultiShiftFunction"
+    ////////////////////////////////////////////////////////////////////////
+    int nshift = shifts.order;
+
+    // mass[0] and the primary shift are indexed unconditionally below
+    assert(nshift > 0);
+
+    std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
+    std::vector<RealD> &mresidual(shifts.tolerances);
+    std::vector<RealD> alpha(nshift,1.0);
+    std::vector<Field>   ps(nshift,grid);// Search directions
+
+    assert(psi.size()==nshift);
+    assert(mass.size()==nshift);
+    assert(mresidual.size()==nshift);
+  
+    // Per-shift recurrence state. Held in std::vector rather than variable length
+    // arrays, which are a compiler extension and not standard C++.
+    //   bs[s]        : current step coefficient for shift s
+    //   rsq[s]       : squared-norm stopping target for shift s
+    //   z[s][iz]     : current  scaling of the shared residual for shift s
+    //   z[s][1-iz]   : previous scaling (iz toggles every iteration)
+    //   converged[s] : non-zero once shift s has met its target
+    std::vector<RealD> bs(nshift);
+    std::vector<RealD> rsq(nshift);
+    std::vector<std::vector<RealD> > z(nshift, std::vector<RealD>(2));
+    std::vector<int>   converged(nshift);
+  
+    const int       primary =0;
+  
+    //Primary shift fields CG iteration
+    RealD a,b,c,d;
+    RealD cp,bp,qq; //prev
+  
+    // Matrix mult fields
+    Field r(grid);
+    Field p(grid);
+    Field tmp(grid);
+    Field mmp(grid);
+  
+    // Check lightest mass
+    for(int s=0;s<nshift;s++){
+      assert( mass[s]>= mass[primary] );
+      converged[s]=0;
+    }
+  
+    // Wire guess to zero
+    // Residuals "r" are src
+    // First search direction "p" is also src
+    cp = norm2(src);
+    for(int s=0;s<nshift;s++){
+      rsq[s] = cp * mresidual[s] * mresidual[s];
+      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
+	       <<" target resid "<<rsq[s]<<std::endl;
+      ps[s] = src;
+    }
+    // r and p for primary
+    r=src;
+    p=src;
+  
+    //MdagM+m[0]
+    Linop.HermOpAndNorm(p,mmp,d,qq);
+    axpy(mmp,mass[0],p,mmp);
+    RealD rn = norm2(p);
+    d += rn*mass[0];
+  
+    // have verified that inner product of 
+    // p and mmp is equal to d after this since
+    // the d computation is tricky
+    //  qq = real(innerProduct(p,mmp));
+    //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
+  
+    b = -cp /d;
+  
+    // Set up the various shift variables
+    int       iz=0;
+    z[0][1-iz] = 1.0;
+    z[0][iz]   = 1.0;
+    bs[0]      = b;
+    for(int s=1;s<nshift;s++){
+      z[s][1-iz] = 1.0;
+      z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
+      bs[s]      = b*z[s][iz]; 
+    }
+  
+    // r += b[0] A.p[0]
+    // c= norm(r)
+    c=axpy_norm(r,b,mmp,r);
+  
+    // First solution update: psi[s] = -bs[s]*alpha[s] * src
+    for(int s=0;s<nshift;s++) {
+      axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
+    }
+  
+    ///////////////////////////////////////
+    // Timers
+    ///////////////////////////////////////
+    GridStopWatch AXPYTimer;
+    GridStopWatch ShiftTimer;
+    GridStopWatch MatrixTimer;
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+  
+    // Iteration loop
+    int k;
+  
+    for (k=1;k<=MaxIterations;k++){
+    
+      a = c /cp;
+      AXPYTimer.Start();
+      axpy(p,a,p,r);
+      AXPYTimer.Stop();
+    
+      // Note to self - direction ps is iterated separately
+      // for each shift. Does not appear to have any scope
+      // for avoiding linear algebra in "single" case.
+      // 
+      // However SAME r is used. Could load "r" and update
+      // ALL ps[s]. 2/3 Bandwidth saving
+      // New Kernel: Load r, vector of coeffs, vector of pointers ps
+      AXPYTimer.Start();
+      for(int s=0;s<nshift;s++){
+        if ( ! converged[s] ) { 
+          if (s==0){
+            axpy(ps[s],a,ps[s],r);
+          } else{
+            RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
+            axpby(ps[s],z[s][iz],as,r,ps[s]);
+          }
+        }
+      }
+      AXPYTimer.Stop();
+    
+      cp=c;
+      MatrixTimer.Start();  
+      //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
+      // The below is faster on KNL
+      Linop.HermOp(p,mmp); 
+      d=real(innerProduct(p,mmp));
+    
+      MatrixTimer.Stop();  
+
+      AXPYTimer.Start();
+      axpy(mmp,mass[0],p,mmp);
+      AXPYTimer.Stop();
+      RealD rn = norm2(p);
+      d += rn*mass[0];
+    
+      bp=b;
+      b=-cp/d;
+    
+      AXPYTimer.Start();
+      c=axpy_norm(r,b,mmp,r);
+      AXPYTimer.Stop();
+
+      // Toggle the recurrence history
+      bs[0] = b;
+      iz = 1-iz;
+      ShiftTimer.Start();
+      for(int s=1;s<nshift;s++){
+        if((!converged[s])){
+          RealD z0 = z[s][1-iz];
+          RealD z1 = z[s][iz];
+          z[s][iz] = z0*z1*bp
+            / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
+          bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
+        }
+      }
+      ShiftTimer.Stop();
+    
+      for(int s=0;s<nshift;s++){
+        int ss = s;
+        // Scope for optimisation here in case of "single".
+        // Could load psi[0] and pull all ps[s] in.
+        //      if ( single ) ss=primary;
+        // Bandwidth saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
+        // Pipelined CG gain:
+        //
+        // New Kernel: Load r, vector of coeffs, vector of pointers ps
+        // New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
+        // If can predict the coefficient bs then we can fuse these and avoid write reread cycle
+        //  on ps[s].
+        // Before:  3 x npole  + 3 x npole
+        // After :  2 x npole (ps[s])        => 3x speed up of multishift CG.
+      
+        if( (!converged[s]) ) { 
+          axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
+        }
+      }
+    
+      // Convergence checks: shift s uses the shared residual norm scaled by z[s]^2
+      int all_converged = 1;
+      for(int s=0;s<nshift;s++){
+      
+        if ( (!converged[s]) ){
+	
+          RealD css  = c * z[s][iz]* z[s][iz];
+	
+          if(css<rsq[s]){
+            std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+            converged[s]=1;
+          } else {
+            all_converged=0;
+          }
+
+        }
+      }
+    
+      if ( all_converged ){
+
+        SolverTimer.Stop();
+
+        std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
+        std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
+      
+        // Check answers: r = (HermOp + mass[s]) psi[s] - alpha[s]*src
+        for(int s=0; s < nshift; s++) { 
+          Linop.HermOpAndNorm(psi[s],mmp,d,qq);
+          axpy(tmp,mass[s],psi[s],mmp);
+          axpy(r,-alpha[s],src,tmp);
+          RealD rn = norm2(r);
+          RealD cn = norm2(src);
+          std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+        }
+
+        std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+        std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
+        std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+
+        IterationsToComplete = k;	
+
+        return;
+      }
+
+    }
+    // ugly hack
+    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
+    //  assert(0);
+
+    // Record the iteration count on failure too, so callers never read an
+    // unset IterationsToComplete.
+    IterationsToComplete = k;
+  }
+
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
+++ b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
@@ -0,0 +1,258 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
+#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
+
+NAMESPACE_BEGIN(Grid);
+
+// Conjugate gradient with "reliable updates".
+//
+// The CG recurrence (residual r_f, search direction p_f, accumulated correction
+// psi_f) runs in single precision with Linop_f. Whenever the squared residual norm
+// drops below Delta times the largest value seen since the previous update, the
+// correction is added to the double precision solution psi, the residual is
+// recomputed in double precision with Linop_d, and the single precision state is
+// restarted from it. Optionally a different single precision operator can be
+// swapped in once the squared residual norm falls below a chosen threshold.
+//
+// psi is used as the initial guess and is updated in place. The iteration counters
+// are reset at the start of every solve, so they are defined on every exit path.
+template<class FieldD,class FieldF, 
+	 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
+	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
+public:
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+  // Defaults true.
+  RealD Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer ReliableUpdatesPerformed;
+
+  bool DoFinalCleanup; //Final DP cleanup, defaults to true
+  Integer IterationsToCleanup; //Final DP cleanup step iterations
+    
+  LinearOperatorBase<FieldF> &Linop_f;
+  LinearOperatorBase<FieldD> &Linop_d;
+  GridBase* SinglePrecGrid;
+  RealD Delta; //reliable update parameter
+
+  //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
+  LinearOperatorBase<FieldF> *Linop_fallback;
+  RealD fallback_transition_tol;
+
+    
+  // tol            : relative tolerance; the squared-norm target is tol^2 * norm2(src)
+  // maxit          : iteration limit (also used for the final cleanup CG)
+  // _delta         : reliable update threshold, see Delta
+  // _sp_grid       : grid on which the single precision fields are created
+  // _Linop_f/_d    : single / double precision operators; only references are kept
+  // err_on_no_conv : initial value of ErrorOnNoConverge
+  // Initialisers are listed in member declaration order, and the counters and the
+  // fallback threshold are given defined starting values.
+  ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
+    : ErrorOnNoConverge(err_on_no_conv),
+      Tolerance(tol),
+      MaxIterations(maxit),
+      IterationsToComplete(0),
+      ReliableUpdatesPerformed(0),
+      DoFinalCleanup(true),
+      IterationsToCleanup(0),
+      Linop_f(_Linop_f),
+      Linop_d(_Linop_d),
+      SinglePrecGrid(_sp_grid),
+      Delta(_delta),
+      Linop_fallback(NULL),
+      fallback_transition_tol(0.)
+  {};
+
+  // Register an alternative single precision operator. It replaces Linop_f for the
+  // remainder of a solve once the squared residual norm drops below
+  // _fallback_transition_tol. Only the address of the operator is stored.
+  void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
+    Linop_fallback = &_Linop_fallback;
+    fallback_transition_tol = _fallback_transition_tol;      
+  }
+    
+  void operator()(const FieldD &src, FieldD &psi) {
+    // Reset the per-solve counters so that every exit path, including the early
+    // return for an already-converged guess, leaves them with defined values.
+    IterationsToComplete = 0;
+    ReliableUpdatesPerformed = 0;
+    IterationsToCleanup = 0;
+
+    LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
+    bool using_fallback = false;
+      
+    psi.Checkerboard() = src.Checkerboard();
+    conformable(psi, src);
+
+    RealD cp, c, a, d, b, ssq, qq, b_pred;
+
+    FieldD p(src);
+    FieldD mmp(src);
+    FieldD r(src);
+
+    // Initial residual computation & set up
+    RealD guess = norm2(psi);
+    assert(std::isnan(guess) == 0);
+    
+    Linop_d.HermOpAndNorm(psi, mmp, d, b);
+    
+    r = src - mmp;
+    p = r;
+
+    a = norm2(p);
+    cp = a;
+    ssq = norm2(src);
+
+    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl;
+
+    // Squared-norm stopping target
+    RealD rsq = Tolerance * Tolerance * ssq;
+
+    // Check if guess is really REALLY good :)
+    if (cp <= rsq) {
+      std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
+      std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
+      return;
+    }
+
+    //Single prec initialization
+    FieldF r_f(SinglePrecGrid);
+    r_f.Checkerboard() = r.Checkerboard();
+    precisionChange(r_f, r);
+
+    FieldF psi_f(r_f);
+    psi_f = Zero();
+
+    // p_f is copy-constructed from r_f, i.e. the first search direction is the
+    // initial residual; mmp_f is scratch for the operator output
+    FieldF p_f(r_f);
+    FieldF mmp_f(r_f);
+
+    RealD MaxResidSinceLastRelUp = cp; //initial residual    
+    
+    std::cout << GridLogIterative << std::setprecision(4)
+	      << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
+
+    GridStopWatch LinalgTimer;
+    GridStopWatch MatrixTimer;
+    GridStopWatch SolverTimer;
+
+    SolverTimer.Start();
+    int k = 0;
+    int l = 0; // number of reliable updates performed so far
+    
+    for (k = 1; k <= MaxIterations; k++) {
+      c = cp;
+
+      MatrixTimer.Start();
+      Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
+      MatrixTimer.Stop();
+
+      LinalgTimer.Start();
+
+      a = c / d;
+      b_pred = a * (a * qq - d) / c; // only reported in the debug log below
+
+      cp = axpy_norm(r_f, -a, mmp_f, r_f);
+      b = cp / c;
+
+      // Fuse these loops ; should be really easy
+      psi_f = a * p_f + psi_f;
+      //p_f = p_f * b + r_f;
+
+      LinalgTimer.Stop();
+
+      std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
+		<< " residual " << cp << " target " << rsq << std::endl;
+      std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl;
+      std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl;
+
+      if(cp > MaxResidSinceLastRelUp){
+	std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
+	MaxResidSinceLastRelUp = cp;
+      }
+	  
+      // Stopping condition
+      if (cp <= rsq) {
+	//Although not written in the paper, I assume that I have to add on the final solution
+	precisionChange(mmp, psi_f);
+	psi = psi + mmp;
+	
+	
+	SolverTimer.Stop();
+	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
+	p = mmp - src;
+
+	RealD srcnorm = std::sqrt(norm2(src));
+	RealD resnorm = std::sqrt(norm2(p));
+	RealD true_residual = resnorm / srcnorm;
+
+	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
+	std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
+	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
+	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+
+	std::cout << GridLogMessage << "Time breakdown "<<std::endl;
+	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+
+	IterationsToComplete = k;	
+	ReliableUpdatesPerformed = l;
+	  
+	if(DoFinalCleanup){
+	  //Do a final CG to cleanup
+	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
+	  ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
+	  CG.ErrorOnNoConverge = ErrorOnNoConverge;
+	  CG(Linop_d,src,psi);
+	  IterationsToCleanup = CG.IterationsToComplete;
+	}
+	else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+
+	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
+	return;
+      }
+      else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
+	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
+		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
+	// Fold the single precision correction into the double precision solution
+	precisionChange(mmp, psi_f);
+	psi = psi + mmp;
+
+	// Recompute the residual in double precision and restart the single
+	// precision state from it
+	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
+	r = src - mmp;
+
+	psi_f = Zero();
+	precisionChange(r_f, r);
+	cp = norm2(r);
+	MaxResidSinceLastRelUp = cp;
+
+	b = cp/c;
+	  
+	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
+	  
+	l = l+1;
+      }
+
+      p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
+
+      if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
+	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
+	Linop_f_use = Linop_fallback;
+	using_fallback = true;
+      }
+
+	
+    }
+    std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
+	      << std::endl;
+
+    // The correction accumulated in single precision since the last reliable update
+    // has not been added to psi yet. Add it, so that a caller running with
+    // ErrorOnNoConverge == false receives the best solution reached rather than the
+    // state at the last reliable update.
+    precisionChange(mmp, psi_f);
+    psi = psi + mmp;
+      
+    if (ErrorOnNoConverge) assert(0);
+    IterationsToComplete = k;
+    ReliableUpdatesPerformed = l;      
+  }    
+};
+
+
+NAMESPACE_END(Grid);
+
+
+
+#endif
--- a/Grid/algorithms/iterative/ConjugateResidual.h
+++ b/Grid/algorithms/iterative/ConjugateResidual.h
@@ -0,0 +1,111 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateResidual.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_CONJUGATE_RESIDUAL_H
+#define GRID_CONJUGATE_RESIDUAL_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////
+// Base classes for iterative processes based on operators
+// single input vec, single output vec.
+/////////////////////////////////////////////////////////////
+
+// Conjugate residual solver for a Hermitian operator.
+//
+// Starts from psi = 0 (any incoming content of psi is discarded) and iterates until
+// the squared residual norm is below Tolerance^2 * norm2(src). Up to MaxIterations
+// iterations are performed; if the target is not reached the failure is reported
+// and assert(0) is hit.
+template<class Field> 
+class ConjugateResidual : public OperatorFunction<Field> {
+public:                                                
+  RealD   Tolerance;
+  Integer MaxIterations;
+  int verbose;  // non-zero enables the per-iteration residual log; set to 0 by the constructor
+
+  ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
+    verbose=0;
+  };
+
+  void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+
+    RealD a, b; // c, d;
+    RealD cp, ssq,rsq;
+      
+    RealD rAr, rAAr, rArp;
+    RealD pAp, pAAp;
+
+    GridBase *grid = src.Grid();
+    psi=Zero();
+    Field r(grid),  p(grid), Ap(grid), Ar(grid);
+      
+    // With a zero guess both the residual and the first search direction are src
+    r=src;
+    p=src;
+
+    Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
+    Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
+
+    cp =norm2(r);
+    ssq=norm2(src);
+    rsq=Tolerance*Tolerance*ssq;
+
+    if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+    // Inclusive upper bound: allow the full MaxIterations iterations, matching the
+    // "k <= MaxIterations" loops of the conjugate gradient solvers.
+    for(int k=1;k<=MaxIterations;k++){
+
+      a = rAr/pAAp;
+
+      axpy(psi,a,p,psi);
+
+      cp = axpy_norm(r,-a,Ap,r);
+
+      rArp=rAr;
+
+      Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
+
+      b   =rAr/rArp;
+ 
+      axpy(p,b,p,r);
+      // Ap is updated by recurrence from Ar, which avoids a second operator
+      // application per iteration
+      pAAp=axpy_norm(Ap,b,Ap,Ar);
+	
+      if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+      if(cp<rsq) {
+	// Recompute the residual from scratch to report the true value
+	Linop.HermOp(psi,Ap);
+	axpy(r,-1.0,src,Ap);
+	RealD true_resid = norm2(r)/ssq;
+	std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
+		 << " computed residual "<<std::sqrt(cp/ssq)
+		 << " true residual "<<std::sqrt(true_resid)
+		 << " target "       <<Tolerance <<std::endl;
+	return;
+      }
+
+    }
+
+    std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
+    assert(0);
+  }
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -0,0 +1,104 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/Deflation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_DEFLATION_H
+#define GRID_DEFLATION_H
+
+namespace Grid { 
+
+// Guesser that ignores the source and always produces a zero initial guess.
+template<class Field>
+class ZeroGuesser: public LinearFunction<Field> {
+public:
+  virtual void operator()(const Field &src, Field &guess)
+  {
+    guess = Zero();
+  }
+};
+
+// Guesser that uses the source itself as the initial guess.
+template<class Field>
+class SourceGuesser: public LinearFunction<Field> {
+public:
+  virtual void operator()(const Field &src, Field &guess)
+  {
+    guess = src;
+  }
+};
+
+////////////////////////////////
+// Fine grid deflation
+////////////////////////////////
+// Guesser built from a set of vectors and one value per vector. The guess is
+//   sum_i  evec[i] * <evec[i]|src> / eval[i]
+// Only references to the two arrays are held, so they must outlive the guesser.
+template<class Field>
+class DeflatedGuesser: public LinearFunction<Field> {
+private:
+  const std::vector<Field> &evec;
+  const std::vector<RealD> &eval;
+
+public:
+
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
+    : evec(_evec),
+      eval(_eval)
+  {}
+
+  virtual void operator()(const Field &src,Field &guess) {
+    guess = Zero();
+    assert(evec.size()==eval.size());
+    auto nvec = evec.size();
+    for (int iv=0; iv<nvec; iv++) {
+      const Field &mode = evec[iv];
+      // projection of src on this vector, divided by the matching value
+      auto coeff = TensorRemove(innerProduct(mode,src)) / eval[iv];
+      axpy(guess,coeff,mode,guess);
+    }
+    // the checkerboard is stamped last, after the accumulation
+    guess.Checkerboard() = src.Checkerboard();
+  }
+};
+
+// Deflation guesser working through a coarse space.
+//
+// The fine source is projected onto the coarse grid with blockProject using
+// subspace, the coarse guess
+//   sum_i  evec_coarse[i] * <evec_coarse[i]|src_coarse> / eval_coarse[i]
+// is accumulated there, and the result is brought back with blockPromote.
+// Only references to the three arrays are held, so they must outlive the guesser.
+template<class FineField, class CoarseField>
+class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
+private:
+  const std::vector<FineField>   &subspace;
+  const std::vector<CoarseField> &evec_coarse;
+  const std::vector<RealD>       &eval_coarse;
+public:
+  
+  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
+				const std::vector<CoarseField> &_evec_coarse,
+				const std::vector<RealD>       &_eval_coarse)
+    : subspace(_subspace), 
+      evec_coarse(_evec_coarse), 
+      eval_coarse(_eval_coarse)  
+  {
+  }
+  
+  void operator()(const FineField &src,FineField &guess) { 
+    // eval_coarse[i] is read for every coarse vector below
+    assert(eval_coarse.size() >= evec_coarse.size());
+
+    int N = (int)evec_coarse.size();
+    if (N == 0) {
+      // No coarse vectors: the sum is empty, and there is no evec_coarse[0] from
+      // which the coarse grid could be taken. Return the empty sum directly.
+      guess = Zero();
+      guess.Checkerboard() = src.Checkerboard();
+      return;
+    }
+
+    CoarseField src_coarse(evec_coarse[0].Grid());
+    CoarseField guess_coarse(evec_coarse[0].Grid());    guess_coarse = Zero();
+    blockProject(src_coarse,src,subspace);    
+    for (int i=0;i<N;i++) {
+      const CoarseField & tmp = evec_coarse[i];
+      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
+    }
+    blockPromote(guess_coarse,guess,subspace);
+    guess.Checkerboard() = src.Checkerboard();
+  }
+};
+
+
+
+}
+#endif
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -0,0 +1,863 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung <chulwoo@bnl.gov>
+Author: Christoph Lehner <clehner@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_BIRL_H
+#define GRID_BIRL_H
+
+#include <string.h> //memset
+//#include <zlib.h>
+#include <sys/stat.h>
+
+NAMESPACE_BEGIN(Grid); 
+
+  ////////////////////////////////////////////////////////
+  // Move following 100 LOC to lattice/Lattice_basis.h
+  ////////////////////////////////////////////////////////
+// Sequentially subtract from w its overlap with each of basis[0..k-1].
+// Each overlap is taken against the already-updated w.
+// (The subtraction is a true projection only if the basis vectors have
+//  unit norm -- presumably guaranteed by the caller.)
+template<class Field>
+void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
+{
+  int j = 0;
+  while ( j < k ) {
+    Field &b = basis[j];
+    auto overlap = innerProduct(b,w);
+    w = w - overlap*b;
+    ++j;
+  }
+}
+
+// In-place rotation of a block of basis vectors, performed site by site:
+//   basis[j] <- sum_{k in [k0,k1)} Qt(j,k) * basis[k]     for j in [j0,j1)
+// basis : vectors to rotate; views are taken of every element
+// Qt    : coefficient matrix, indexed as Qt(j,k)
+// Nm    : size of the per-thread scratch array B, which is indexed by j,
+//         so j1 must not exceed Nm
+template<class Field>
+void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
+{
+  typedef decltype(basis[0].View()) View;
+  auto tmp_v = basis[0].View();
+  // Pre-fill with a copy of the first view, then overwrite each slot below
+  std::vector<View> basis_v(basis.size(),tmp_v);
+  typedef typename Field::vector_object vobj;
+  GridBase* grid = basis[0].Grid();
+      
+  for(int k=0;k<basis.size();k++){
+    basis_v[k] = basis[k].View();
+  }
+
+  thread_region
+  {
+    std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
+    thread_loop_in_region( (int ss=0;ss < grid->oSites();ss++),{
+      for(int j=j0; j<j1; ++j) B[j]=0.;
+      
+      // Accumulate every output row into B first ...
+      for(int j=j0; j<j1; ++j){
+	for(int k=k0; k<k1; ++k){
+	  B[j] +=Qt(j,k) * basis_v[k][ss];
+	}
+      }
+      // ... and only then write back, so no input is overwritten while still needed
+      for(int j=j0; j<j1; ++j){
+	basis_v[j][ss] = B[j];
+      }
+    });
+  }
+}
+
+// Extract a single rotated vector:
+//   result <- sum_{k in [k0,k1)} Qt(j,k) * basis[k]
+// result : overwritten; takes the checkerboard of basis[0]
+// basis  : input vectors (not modified)
+// Qt     : coefficient matrix, row j is used
+// Nm     : not used by this routine; kept so the signature parallels basisRotate
+template<class Field>
+void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
+{
+  typedef decltype(basis[0].View()) View;
+  typedef typename Field::vector_object vobj;
+  GridBase* grid = basis[0].Grid();
+
+  result.Checkerboard() = basis[0].Checkerboard();
+  auto result_v=result.View();
+
+  // Take the views once, outside the site loop (same pattern as basisRotate),
+  // instead of constructing a view per site and per k inside the threaded loop.
+  auto tmp_v = basis[0].View();
+  std::vector<View> basis_v(basis.size(),tmp_v);
+  for(int k=k0; k<k1; ++k){
+    basis_v[k] = basis[k].View();
+  }
+
+  thread_loop( (int ss=0;ss < grid->oSites();ss++),{
+    vobj B = Zero();
+    for(int k=k0; k<k1; ++k){
+      B +=Qt(j,k) * basis_v[k][ss];
+    }
+    result_v[ss] = B;
+  });
+}
+
+// Apply the permutation idx to _v and sort_vals in place.
+// idx[i] names the element that must end up in slot i.  On return idx has
+// been overwritten with the identity on its range.
+template<class Field>
+void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
+{
+  const int vlen = idx.size();
+
+  assert(vlen>=1);
+  assert(vlen<=sort_vals.size());
+  assert(vlen<=_v.size());
+
+  for (size_t dst=0;dst<vlen;dst++) {
+
+    // Slot already holds the right element
+    if (idx[dst] == dst) continue;
+
+    //////////////////////////////////////
+    // The element currently in slot dst is wanted by some later slot
+    // "holder" (idx[holder]==dst).  After swapping slot dst with slot
+    // idx[dst], that element lives in slot idx[dst], so holder must be
+    // redirected there and slot dst is marked done.
+    //////////////////////////////////////
+    size_t holder = dst;
+    while ( holder<idx.size() && !(idx[holder]==dst) ) {
+      holder++;
+    }
+
+    assert(idx[dst] > dst);     assert(holder!=idx.size());      assert(idx[holder]==dst);
+
+    const int from = idx[dst];
+    swap(_v[dst],_v[from]); // should use vector move constructor, no data copy
+    std::swap(sort_vals[dst],sort_vals[from]);
+
+    idx[holder] = from;
+    idx[dst] = dst;
+  }
+}
+
+// Returns the index permutation that orders sort_vals by increasing
+// absolute value.  sort_vals itself is not modified.
+inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
+{
+  // Start from the identity permutation 0,1,...,n-1
+  std::vector<int> order(sort_vals.size());
+  std::iota(order.begin(), order.end(), 0);
+
+  // Comparator: a precedes b when |sort_vals[a]| < |sort_vals[b]|
+  auto smallerMagnitude = [&sort_vals](int a, int b) {
+    return ::fabs(sort_vals[a]) < ::fabs(sort_vals[b]);
+  };
+  std::sort(order.begin(), order.end(), smallerMagnitude);
+  return order;
+}
+
+// Sort _v and sort_vals together by |sort_vals|: ascending by default,
+// descending when reverse is true.
+template<class Field>
+void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
+{
+  std::vector<int> order = basisSortGetIndex(sort_vals);
+  if (reverse) {
+    std::reverse(order.begin(), order.end());
+  }
+  basisReorderInPlace(_v,sort_vals,order);
+}
+
+// Deflation sum:  result = sum_i _v[i] * <_v[i]|src_orig> / eval[i]
+//   _v       : eigenvectors
+//   eval     : eigenvalues, same length as _v (asserted)
+//   src_orig : source vector (not modified)
+//   result   : overwritten with the deflated guess
+// PAB: faster to compute the inner products first then fuse loops.
+// If performance critical can improve.
+template<class Field>
+void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
+  result = Zero();
+  assert(_v.size()==eval.size());
+  int N = (int)_v.size();
+  for (int i=0;i<N;i++) {
+    // _v is a const container: the element must be bound as const,
+    // otherwise the template fails to compile when instantiated.
+    const Field& tmp = _v[i];
+    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
+  }
+}
+
+/////////////////////////////////////////////////////////////
+// Implicitly restarted lanczos
+/////////////////////////////////////////////////////////////
+// Abstract convergence-test interface used by ImplicitlyRestartedLanczos.
+// Both calls receive the vector index j, the residual tolerance, the
+// candidate vector, its eigenvalue (by reference, so an implementation may
+// update it) and an estimate of the largest eigenvalue.  A non-zero return
+// is counted as "converged" by the caller.
+template<class Field> class ImplicitlyRestartedLanczosTester 
+{
+ public:
+  // Polymorphic base: make destruction through a base pointer well defined.
+  virtual ~ImplicitlyRestartedLanczosTester() = default;
+  virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
+  virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
+};
+
+// Selects the routine ImplicitlyRestartedLanczos::diagonalize() dispatches to
+// when diagonalising the Lanczos tridiagonal matrix.
+enum IRLdiagonalisation { 
+  IRLdiagonaliseWithDSTEGR, // diagonalize_lapack (asserts unless built with USE_LAPACK)
+  IRLdiagonaliseWithQR,     // diagonalize_QR
+  IRLdiagonaliseWithEigen   // diagonalize_Eigen
+};
+
+// Default tester: judges convergence of a candidate eigenvector B by applying
+// the operator directly and measuring the residual of the Rayleigh quotient.
+template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
+{
+ public:
+
+  LinearFunction<Field>       &_HermOp;  // operator under test, held by reference
+  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
+
+  // For this tester reconstruction is the same as the convergence test.
+  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) override
+  {
+    return TestConvergence(j,resid,B,eval,evalMaxApprox);
+  }
+
+  // j             : index of the vector, used for logging only
+  // eresid        : tolerance; converged when the scaled residual norm^2 < eresid^2
+  // B             : candidate eigenvector
+  // eval          : in: caller's eigenvalue estimate (only logged);
+  //                 out: Rayleigh quotient <B|H B>/<B|B>
+  // evalMaxApprox : estimate of the largest eigenvalue, used to scale the residual
+  // returns 1 if converged, 0 otherwise
+  int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox) override
+  {
+    Field v(B);
+    RealD eval_poly = eval;
+    // Apply operator
+    _HermOp(B,v);
+
+    RealD vnum = real(innerProduct(B,v)); // HermOp.
+    RealD vden = norm2(B);
+    // (the norm of H B used to be computed here but was never used;
+    //  the redundant reduction has been dropped)
+    eval   = vnum/vden;
+    v -= eval*B;
+
+    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
+
+    std::cout.precision(13);
+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+             <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+             <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+             <<std::endl;
+
+    int conv=0;
+    if( (vv<eresid*eresid) ) conv = 1;
+
+    return conv;
+  }
+};
+
+// Implicitly restarted Lanczos eigensolver.  calc() is the entry point; the
+// remaining members are its configuration and helpers.
+template<class Field> 
+class ImplicitlyRestartedLanczos {
+ private:
+  const RealD small = 1.0e-8; // NOTE(review): not referenced in the visible part of the class
+  int MaxIter;    // Upper bound on the number of restart iterations in calc()
+  int MinRestart; // Minimum number of restarts; only check for convergence after
+  int Nstop;   // Number of evecs checked for convergence
+  int Nk;      // Number of converged sought
+  //  int Np;      // Np -- Number of spare vecs in krylov space //  == Nm - Nk
+  int Nm;      // Nm -- total number of vectors
+  IRLdiagonalisation diagonalisation; // which tridiagonal solver diagonalize() calls
+  int orth_period; // step() re-orthogonalises when k % orth_period == 0
+    
+  RealD OrthoTime; // seconds accumulated inside orthogonalize(); reset by calc()
+  RealD eresid, betastp; // eresid: tolerance handed to the tester;
+                         // betastp: stored, but the beta(k) test in calc() is commented out
+  ////////////////////////////////
+  // Embedded objects
+  ////////////////////////////////
+  LinearFunction<Field>       &_PolyOp; // operator applied in each Lanczos step()
+  LinearFunction<Field>       &_HermOp; // operator used in calc() to estimate the largest eigenvalue
+  ImplicitlyRestartedLanczosTester<Field> &_Tester; // convergence tester (may refer to SimpleTester)
+  // Default tester provided (we need a ref to something in default case)
+  ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
+  /////////////////////////
+  // Constructor
+  /////////////////////////
+  
+public:       
+
+  //////////////////////////////////////////////////////////////////
+  // PAB:
+  //////////////////////////////////////////////////////////////////
+  // Too many options  & knobs. 
+  // Eliminate:
+  //   orth_period
+  //   betastp
+  //   MinRestart
+  //
+  // Do we really need orth_period
+  // What is the theoretical basis & guarantees of betastp ?
+  // Nstop=Nk viable?
+  // MinRestart avoidable with new convergence test?
+  // Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
+  // HermOp could be eliminated if we dropped the Power method for max eval.
+  // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
+  //////////////////////////////////////////////////////////////////
+ // Constructor taking a user-supplied convergence tester.
+ // The initialiser list is written in member declaration order (the order
+ // in which C++ actually initialises them) and OrthoTime starts at zero.
+ ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
+                            LinearFunction<Field> & HermOp,
+                            ImplicitlyRestartedLanczosTester<Field> & Tester,
+                            int _Nstop, // number of evecs checked for convergence
+                            int _Nk, // number of vecs sought
+                            int _Nm, // total number of vecs (sought + spare)
+                            RealD _eresid, // residual tolerance passed to the tester
+                            int _MaxIter, // Max iterations
+                            RealD _betastp=0.0, // if beta(k) < betastp: converged (test currently disabled in calc)
+                            int _MinRestart=1, int _orth_period = 1,
+                            IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
+    MaxIter(_MaxIter),      MinRestart(_MinRestart),
+    Nstop(_Nstop),      Nk(_Nk),      Nm(_Nm),
+    diagonalisation(_diagonalisation),      orth_period(_orth_period),
+    OrthoTime(0.0),
+    eresid(_eresid),      betastp(_betastp),
+    _PolyOp(PolyOp),      _HermOp(HermOp),      _Tester(Tester),
+    SimpleTester(HermOp)
+  { }
+
+    // Constructor using the built-in tester (SimpleTester, built on HermOp).
+    // The initialiser list is written in member declaration order.  _Tester
+    // is declared before SimpleTester, so the reference is bound to the
+    // member's storage before SimpleTester is constructed; it is only used
+    // later, from calc().
+    ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
+                               LinearFunction<Field> & HermOp,
+                               int _Nstop, // number of evecs checked for convergence
+                               int _Nk, // number of vecs sought
+                               int _Nm, // total number of vecs (sought + spare)
+                               RealD _eresid, // residual tolerance passed to the tester
+                               int _MaxIter, // Max iterations
+                               RealD _betastp=0.0, // if beta(k) < betastp: converged (test currently disabled in calc)
+                               int _MinRestart=1, int _orth_period = 1,
+                               IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
+    MaxIter(_MaxIter),      MinRestart(_MinRestart),
+    Nstop(_Nstop),      Nk(_Nk),      Nm(_Nm),
+    diagonalisation(_diagonalisation),      orth_period(_orth_period),
+    OrthoTime(0.0),
+    eresid(_eresid),      betastp(_betastp),
+    _PolyOp(PolyOp),      _HermOp(HermOp),      _Tester(SimpleTester),
+    SimpleTester(HermOp)
+  { }
+
+  ////////////////////////////////
+  // Helpers
+  ////////////////////////////////
+  // Rescale v to unit 2-norm; returns the norm v had on entry.
+  template<typename T>  static RealD normalise(T& v) 
+  {
+    const RealD normSq = norm2(v);
+    const RealD len    = std::sqrt(normSq);
+    v = v * (1.0/len);
+    return len;
+  }
+
+  // Orthogonalise w against evec[0..k-1], renormalise it, and add the
+  // elapsed wall time (in seconds) to OrthoTime.
+  void orthogonalize(Field& w, std::vector<Field>& evec,int k)
+  {
+    OrthoTime = OrthoTime - usecond()/1e6;   // subtract the start stamp ...
+    basisOrthogonalize(evec,w,k);
+    normalise(w);
+    OrthoTime = OrthoTime + usecond()/1e6;   // ... add the stop stamp
+  }
+
+/* Rudy Arthur's thesis pp.137
+------------------------
+Require: M > K, P = M − K
+Compute the factorization A V_M = V_M H_M + f_M e_M†
+repeat
+  Q = I
+  for i = 1,...,P do
+    Q_i R_i = H_M − θ_i I ;  Q = Q Q_i
+    H_M = Q_i† H_M Q_i
+  end for
+  β_K = H_M(K+1,K) ;  σ_K = Q(M,K)
+  r = v_{K+1} β_K + r σ_K
+  V_K = V_M(1:M) Q(1:M,1:K)
+  H_K = H_M(1:K,1:K)
+  → A V_K = V_K H_K + f_K e_K†
+  Extend to an M = K + P step factorization A V_M = V_M H_M + f_M e_M†
+until convergence
+*/
+  // Run the implicitly restarted Lanczos iteration.
+  //   eval    : in : storage for at least Nm values (asserted)
+  //             out: resized to Nconv; holds the values left by the tester's
+  //                  ReconstructEval, ordered by basisSortInPlace
+  //   evec    : in : at least Nm fields on the same grid as src (asserted)
+  //             out: resized to Nconv, ordered together with eval
+  //   src     : starting vector (copied into evec[0] and normalised)
+  //   Nconv   : out: number of vectors accepted by the final test
+  //   reverse : forwarded to basisSortInPlace (false: ascending |eval|)
+  // Calls abort() if MaxIter restarts pass without convergence.
+  void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false)
+  {
+    GridBase *grid = src.Grid();
+    assert(grid == evec[0].Grid());
+    
+    GridLogIRL.TimingMode(1);
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
+    std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
+    std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
+    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
+      std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
+    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
+      std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
+    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+      std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
+    }
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+
+    assert(Nm <= evec.size() && Nm <= eval.size());
+    
+    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
+    // (repeated application of _HermOp; stops early once successive
+    //  Rayleigh quotients agree to within 5%)
+    RealD evalMaxApprox = 0.0;
+    {
+      auto src_n = src;
+      auto tmp = src;
+      const int _MAX_ITER_IRL_MEVAPP_ = 50;
+      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
+        normalise(src_n);
+        _HermOp(src_n,tmp);
+        RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+        RealD vden = norm2(src_n);
+        RealD na = vnum/vden;
+        if (fabs(evalMaxApprox/na - 1.0) < 0.05)
+          i=_MAX_ITER_IRL_MEVAPP_; // forces loop exit after this pass
+        evalMaxApprox = na;
+        std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
+        src_n = tmp;
+      }
+    }
+
+    std::vector<RealD> lme(Nm);  
+    std::vector<RealD> lme2(Nm);
+    std::vector<RealD> eval2(Nm);
+    std::vector<RealD> eval2_copy(Nm);
+    Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
+
+    Field f(grid);
+    Field v(grid);
+    int k1 = 1;
+    int k2 = Nk;
+    RealD beta_k;
+
+    // The restart below indexes evec[k2], lme[k2-1] and row k1-1 of Qt;
+    // check the ranges before any of them is used.
+    assert(k2<Nm);      assert(k1>0);
+
+    Nconv = 0;
+  
+    // Set initial vector
+    evec[0] = src;
+    normalise(evec[0]);
+
+    // Initial Nk steps
+    OrthoTime=0.;
+    for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
+    std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
+    std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
+
+    //////////////////////////////////
+    // Restarting loop begins
+    //////////////////////////////////
+    int iter;
+    for(iter = 0; iter<MaxIter; ++iter){
+      
+      OrthoTime=0.;
+
+      std::cout<< GridLogMessage <<" **********************"<< std::endl;
+      std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
+      std::cout<< GridLogMessage <<" **********************"<< std::endl;
+
+      // Extend the factorisation from Nk to Nm steps
+      std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
+      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
+      f *= lme[Nm-1];
+
+      std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
+      std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
+
+      //////////////////////////////////
+      // getting eigenvalues
+      //////////////////////////////////
+      for(int k=0; k<Nm; ++k){
+        eval2[k] = eval[k+k1-1];
+        lme2[k] = lme[k+k1-1];
+      }
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
+      std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
+
+      //////////////////////////////////
+      // sorting
+      //////////////////////////////////
+      eval2_copy = eval2;
+      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
+      std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
+      const int chunk=8;
+      for(int io=0; io<k2;io+=chunk){
+        std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
+        for(int ii=0;ii<chunk;ii++){
+          if ( (io+ii)<k2 )
+            std::cout<< " "<< std::setw(12)<< eval2[io+ii];
+        }
+        std::cout << std::endl;
+      }
+
+      //////////////////////////////////
+      // Implicitly shifted QR transformations
+      // (the unwanted values eval2[k2..Nm-1] are used as shifts)
+      //////////////////////////////////
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      for(int ip=k2; ip<Nm; ++ip){ 
+        QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
+      }
+      std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
+
+      basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
+      std::cout<<GridLogIRL <<"basisRotated  by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
+      
+      ////////////////////////////////////////////////////
+      // Compressed vector f and beta(k2)
+      ////////////////////////////////////////////////////
+      f *= Qt(k2-1,Nm-1);
+      f += lme[k2-1] * evec[k2];
+      beta_k = norm2(f);
+      beta_k = std::sqrt(beta_k);
+      std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
+
+      RealD betar = 1.0/beta_k;
+      evec[k2] = betar * f;
+      lme[k2-1] = beta_k;
+
+      ////////////////////////////////////////////////////
+      // Convergence test
+      ////////////////////////////////////////////////////
+      for(int k=0; k<Nm; ++k){    
+        eval2[k] = eval[k];
+        lme2[k] = lme[k];
+      }
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
+      std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
+
+      Nconv = 0;
+      if (iter >= MinRestart) {
+
+        std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
+
+        Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
+
+        //  power of two search pattern;  not every evalue in eval2 is assessed.
+        int allconv =1;
+        for(int jj = 1; jj<=Nstop; jj*=2){
+          int j = Nstop-jj;
+          RealD e = eval2_copy[j]; // Discard the evalue
+          basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
+          if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
+            allconv=0;
+          }
+        }
+        // Do evec[0] for good measure
+        { 
+          int j=0;
+          RealD e = eval2_copy[0]; 
+          basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
+          if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
+        }
+        if ( allconv ) Nconv = Nstop;
+
+        // test if we converged, if so, terminate
+        std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
+        //	if( Nconv>=Nstop || beta_k < betastp){
+        if( Nconv>=Nstop){
+          goto converged;
+        }
+
+      } else {
+        std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
+      }
+    } // end of iter loop
+
+    std::cout<<GridLogError<<"\n NOT converged.\n";
+    abort();
+
+  converged:
+    {
+      Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
+      basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
+      std::cout << GridLogIRL << " Rotated basis"<<std::endl;
+      Nconv=0;
+      //////////////////////////////////////////////////////////////////////
+      // Full final convergence test; unconditionally applied
+      // Only evec[0..Nk-1] were rotated into Ritz vectors above, with their
+      // values in eval2[0..Nk-1].  evec[Nk] holds the normalised residual
+      // vector and must not be tested or counted, hence j<Nk.
+      //////////////////////////////////////////////////////////////////////
+      for(int j = 0; j<Nk; j++){
+        B=evec[j];
+        if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
+          Nconv++;
+        }
+      }
+
+      if ( Nconv < Nstop )
+        std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
+
+      eval=eval2;
+      
+      //Keep only converged
+      eval.resize(Nconv);// Nstop?
+      evec.resize(Nconv,grid);// Nstop?
+      basisSortInPlace(evec,eval,reverse);
+      
+    }
+       
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogIRL << " -- Iterations  = "<< iter   << "\n";
+    std::cout << GridLogIRL << " -- beta(k)     = "<< beta_k << "\n";
+    std::cout << GridLogIRL << " -- Nconv       = "<< Nconv  << "\n";
+    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
+  }
+
+ private:
+/* Saad PP. 195
+1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
+2. For k = 1,2,...,m Do:
+3. wk:=Avk−βkv_{k−1}      
+4. αk:=(wk,vk)       // 
+5. wk:=wk−αkvk       // wk orthog vk 
+6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+7. vk+1 := wk/βk+1
+8. EndDo
+ */
+  // One Lanczos step for index k.
+  //   lmd  : lmd[k] receives alpha_k (real part of <v_k|w>)
+  //   lme  : lme[k] receives beta_{k+1} (norm of w after removing v_{k-1} and v_k)
+  //   evec : evec[k] is read; evec[k+1] is written when k < Nm-1
+  //   w    : work vector; on return holds the normalised next vector
+  void step(std::vector<RealD>& lmd,
+            std::vector<RealD>& lme, 
+            std::vector<Field>& evec,
+            Field& w,int Nm,int k)
+  {
+    const RealD tiny = 1.0e-20;
+    assert( k< Nm );
+
+    GridStopWatch gsw_op,gsw_o;
+
+    Field& vk = evec[k];
+
+    // 3. w := A v_k - beta_k v_{k-1}
+    _PolyOp(vk,w);
+    std::cout<<GridLogIRL << "PolyOp" <<std::endl;
+    if ( k>0 ) {
+      w -= lme[k-1] * evec[k-1];
+    }
+
+    // 4. alpha_k := (w, v_k)
+    ComplexD alphaC = innerProduct(vk,w);
+    RealD    alphaR = real(alphaC);
+
+    // 5. w := w - alpha_k v_k
+    w = w - alphaR * vk;
+
+    // 6./7. beta_{k+1} := |w| ; w := w / beta_{k+1}
+    RealD betaNext = normalise(w);
+
+    lmd[k] = alphaR;
+    lme[k] = betaNext;
+
+    // Periodic full re-orthogonalisation against evec[0..k-1]
+    const bool reorthogonalise = (k>0) && (k % orth_period == 0);
+    if ( reorthogonalise ) {
+      orthogonalize(w,evec,k); // orthonormalise
+      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
+    }
+
+    if ( k < Nm-1 ) {
+      evec[k+1] = w;
+    }
+
+    std::cout<<GridLogIRL << "alpha[" << k << "] = " << alphaC << " beta[" << k << "] = "<<betaNext<<std::endl;
+    if ( betaNext < tiny ) {
+      std::cout<<GridLogIRL << " beta is tiny "<<betaNext<<std::endl;
+    }
+  }
+
+  // Diagonalise the leading Nk x Nk symmetric tridiagonal matrix
+  // (diagonal lmd, off-diagonal lme) with Eigen's self-adjoint solver.
+  // Results are stored in reverse of the solver's order:
+  //   lmd[Nk-1-i]   = i-th eigenvalue returned by the solver
+  //   Qt(Nk-1-i, :) = matching eigenvector, stored as a row
+  // Nm and grid are unused here; they keep the signature uniform with the
+  // other diagonalize_* routines.
+  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+                         int Nk, int Nm,  
+                         Eigen::MatrixXd & Qt, // Nm x Nm
+                         GridBase *grid)
+  {
+    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
+
+    for(int i=0;i<Nk;i++) {
+      TriDiag(i,i) = lmd[i];
+      if ( i<Nk-1 ) {
+        TriDiag(i,i+1) = lme[i];
+        TriDiag(i+1,i) = lme[i];
+      }
+    }
+    
+    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
+
+    for (int i = 0; i < Nk; i++) {
+      const int row = Nk-1-i;
+      lmd[row] = eigensolver.eigenvalues()(i);
+      for (int j = 0; j < Nk; j++) {
+        Qt(row,j) = eigensolver.eigenvectors()(j,i);
+      }
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // File could end here if settle on Eigen ??? !!!
+  ///////////////////////////////////////////////////////////////////////////
+  // One shifted QR sweep on the symmetric tridiagonal matrix held as
+  // diagonal lmd and off-diagonal lme, applied as a sequence of plane
+  // rotations on rows (k,k+1) for k = kmin-1 .. kmax-2.  Each rotation is
+  // also applied to rows k,k+1 of Qt (columns 0..Nk-1), so Qt accumulates
+  // the transformation.
+  //   Dsh       : shift subtracted when forming the first rotation
+  //   kmin,kmax : 1-based bounds of the active block
+  // lmd, lme and Qt are modified in place.  Nm is not used in the body.
+  void QR_decomp(std::vector<RealD>& lmd,   // Nm 
+		 std::vector<RealD>& lme,   // Nm 
+		 int Nk, int Nm,            // Nk, Nm
+		 Eigen::MatrixXd& Qt,       // Nm x Nm matrix
+		 RealD Dsh, int kmin, int kmax)
+  {
+    int k = kmin-1;
+    RealD x;
+    
+    // First rotation: chosen from the shifted leading column (lmd[k]-Dsh, lme[k])
+    RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
+    RealD c = ( lmd[k] -Dsh) *Fden;
+    RealD s = -lme[k] *Fden;
+      
+    RealD tmpa1 = lmd[k];
+    RealD tmpa2 = lmd[k+1];
+    RealD tmpb  = lme[k];
+
+    // Apply the rotation to the 2x2 block and record the fill-in ("bulge") x
+    lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+    lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+    lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+    x        =-s*lme[k+1];
+    lme[k+1] = c*lme[k+1];
+      
+    for(int i=0; i<Nk; ++i){
+      RealD Qtmp1 = Qt(k,i);
+      RealD Qtmp2 = Qt(k+1,i);
+      Qt(k,i)  = c*Qtmp1 - s*Qtmp2;
+      Qt(k+1,i)= s*Qtmp1 + c*Qtmp2; 
+    }
+
+    // Givens transformations
+    // Each pass picks the rotation from (lme[k-1], x), i.e. the one that
+    // removes the fill-in x created by the previous rotation.
+    for(int k = kmin; k < kmax-1; ++k){
+      
+      RealD Fden = 1.0/hypot(x,lme[k-1]);
+      RealD c = lme[k-1]*Fden;
+      RealD s = - x*Fden;
+	
+      RealD tmpa1 = lmd[k];
+      RealD tmpa2 = lmd[k+1];
+      RealD tmpb  = lme[k];
+
+      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+      lme[k-1] = c*lme[k-1] -s*x;
+
+      // No further fill-in is generated by the last rotation of the block
+      if(k != kmax-2){
+	x = -s*lme[k+1];
+	lme[k+1] = c*lme[k+1];
+      }
+
+      for(int i=0; i<Nk; ++i){
+	RealD Qtmp1 = Qt(k,i);
+	RealD Qtmp2 = Qt(k+1,i);
+	Qt(k,i)     = c*Qtmp1 -s*Qtmp2;
+	Qt(k+1,i)   = s*Qtmp1 +c*Qtmp2;
+      }
+    }
+  }
+
+  // Reset Qt to the Nm x Nm identity, then hand the tridiagonal problem
+  // (diagonal lmd, off-diagonal lme) to the solver selected by the
+  // 'diagonalisation' member.  An unknown selector trips assert(0).
+  void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+                   int Nk, int Nm,   
+                   Eigen::MatrixXd & Qt,
+                   GridBase *grid)
+  {
+    Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+    switch ( diagonalisation ) {
+    case IRLdiagonaliseWithDSTEGR:
+      diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
+      break;
+    case IRLdiagonaliseWithQR:
+      diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
+      break;
+    case IRLdiagonaliseWithEigen:
+      diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
+      break;
+    default:
+      assert(0);
+      break;
+    }
+  }
+
+// Declaration of the dstegr routine called by diagonalize_lapack; only
+// compiled when USE_LAPACK is defined.
+// NOTE(review): this declaration sits inside the class body, so it declares a
+// member function rather than a free (extern) symbol -- confirm how it is
+// expected to resolve against the LAPACK library at link time.
+#ifdef USE_LAPACK
+void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
+                   double *vl, double *vu, int *il, int *iu, double *abstol,
+                   int *m, double *w, double *z, int *ldz, int *isuppz,
+                   double *work, int *lwork, int *iwork, int *liwork,
+                   int *info);
+#endif
+
+// Diagonalise the leading Nk x Nk tridiagonal matrix (diagonal lmd,
+// off-diagonal lme) with LAPACK dstegr, splitting the eigenvalue index range
+// across the processes of 'grid'.
+//  - each process requests indices [il,iu] only, moves its results to their
+//    global positions and zeroes everything else
+//  - GlobalSumVector then combines the per-process pieces
+//  - results are written back in reversed order: lmd[Nk-1-i], Qt(Nk-1-i,:)
+// Without USE_LAPACK the routine asserts.
+// NOTE(review): the work arrays are variable-length arrays on the stack
+// (including an NN x NN matrix); this is a compiler extension in C++ and
+// large Nk may exhaust the stack.
+void diagonalize_lapack(std::vector<RealD>& lmd,
+			std::vector<RealD>& lme, 
+			int Nk, int Nm,  
+			Eigen::MatrixXd& Qt,
+			GridBase *grid)
+{
+#ifdef USE_LAPACK
+  const int size = Nm;
+  int NN = Nk;
+  double evals_tmp[NN];
+  double evec_tmp[NN][NN];
+  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
+  double DD[NN];
+  double EE[NN];
+  // Copy the diagonal into DD (and evals_tmp) and the sub-diagonal into EE
+  for (int i = 0; i< NN; i++) {
+    for (int j = i - 1; j <= i + 1; j++) {
+      if ( j < NN && j >= 0 ) {
+	if (i==j) DD[i] = lmd[i];
+	if (i==j) evals_tmp[i] = lmd[i];
+	if (j==(i-1)) EE[j] = lme[j];
+      }
+    }
+  }
+  int evals_found;
+  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+  int liwork =  3+NN*10 ;
+  int iwork[liwork];
+  double work[lwork];
+  int isuppz[2*NN];
+  char jobz = 'V'; // calculate evals & evecs
+  char range = 'I'; // calculate only the il-th through iu-th evals (this process's share)
+  //    char range = 'A'; // calculate all evals
+  char uplo = 'U'; // refer to upper half of original matrix
+  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
+  int ifail[NN];
+  int info;
+  int total = grid->_Nprocessors;
+  int node  = grid->_processor;
+  // Contiguous, 1-based index range handled by this process
+  int interval = (NN/total)+1;
+  double vl = 0.0, vu = 0.0;
+  int il = interval*node+1 , iu = interval*(node+1);
+  if (iu > NN)  iu=NN;
+  double tol = 0.0;
+  if (1) {
+    memset(evals_tmp,0,sizeof(double)*NN);
+    if ( il <= NN){
+      LAPACK_dstegr(&jobz, &range, &NN,
+		    (double*)DD, (double*)EE,
+		    &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
+		    &tol, // tolerance
+		    &evals_found, evals_tmp, (double*)evec_tmp, &NN,
+		    isuppz,
+		    work, &lwork, iwork, &liwork,
+		    &info);
+      // Results arrive packed at the front of the arrays: move them
+      // (highest index first) to their global slots and zero the vacated ones
+      for (int i = iu-1; i>= il-1; i--){
+	evals_tmp[i] = evals_tmp[i - (il-1)];
+	if (il>1) evals_tmp[i-(il-1)]=0.;
+	for (int j = 0; j< NN; j++){
+	  evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
+	  if (il>1) evec_tmp[i-(il-1)][j]=0.;
+	}
+      }
+    }
+    {
+      // Every slot is non-zero on at most one process, so the sum gathers them
+      grid->GlobalSumVector(evals_tmp,NN);
+      grid->GlobalSumVector((double*)evec_tmp,NN*NN);
+    }
+  } 
+  // Safer to sort instead of just reversing it, 
+  // but the document of the routine says evals are sorted in increasing order. 
+  // qr gives evals in decreasing order.
+  for(int i=0;i<NN;i++){
+    lmd [NN-1-i]=evals_tmp[i];
+    for(int j=0;j<NN;j++){
+      Qt((NN-1-i),j)=evec_tmp[i][j];
+    }
+  }
+#else 
+  assert(0);
+#endif
+}
+
+// Diagonalise the leading Nk x Nk symmetric tridiagonal matrix (diagonal lmd,
+// off-diagonal lme) by repeated shifted QR sweeps (QR_decomp), accumulating
+// the rotations in Qt.
+//   lmd, lme : overwritten; on return lmd holds the eigenvalues
+//   Nk       : size of the active problem
+//   Nm       : sets the sweep limit (100*Nm) and is forwarded to QR_decomp
+//   Qt       : accumulates the transformation (caller provides the identity)
+//   grid     : unused
+// Aborts if the sweep limit is reached without deflating every off-diagonal.
+void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+                    int Nk, int Nm,   
+                    Eigen::MatrixXd & Qt,
+                    GridBase *grid)
+{
+  // A 0x0 or 1x1 problem is already diagonal; the shift below would
+  // otherwise read lmd[kmax-2] out of range.
+  if ( Nk < 2 ) return;
+
+  int QRiter = 100*Nm;
+  int kmin = 1;
+  int kmax = Nk;
+  
+  // (this should be more sophisticated)
+  for(int iter=0; iter<QRiter; ++iter){
+    
+    // determination of 2x2 leading submatrix
+    RealD dsub = lmd[kmax-1]-lmd[kmax-2];
+    RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
+    // Sign of dsub.  Writing it as dsub/fabs(dsub) gives 0/0 = NaN when the
+    // two diagonal entries coincide, which then poisons every later value;
+    // take +1 in that case.
+    RealD sgn = (dsub < 0.0) ? -1.0 : 1.0;
+    RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*sgn);
+    // (Dsh: shift)
+    
+    // transformation
+    QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
+    
+    // Convergence criterion (redef of kmin and kamx)
+    // Scan upwards from the bottom for the last off-diagonal that is still
+    // significant relative to its neighbouring diagonal entries.
+    for(int j=kmax-1; j>= kmin; --j){
+      RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
+      if(fabs(lme[j-1])+dds > dds){
+        kmax = j+1;
+        goto continued;
+      }
+    }
+    // Every off-diagonal in the active block is negligible: done
+    return;
+    
+  continued:
+    // Move kmin down to the first significant off-diagonal
+    for(int j=0; j<kmax-1; ++j){
+      RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
+      if(fabs(lme[j])+dds > dds){
+        kmin = j+1;
+        break;
+      }
+    }
+  }
+  std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
+  abort();
+}
+};
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -0,0 +1,405 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
+
+    Copyright (C) 2015
+
+Author: Christoph Lehner <clehner@bnl.gov>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LOCAL_COHERENCE_IRL_H
+#define GRID_LOCAL_COHERENCE_IRL_H
+
+NAMESPACE_BEGIN(Grid); 
+
+// Serializable parameter bundle for a single Lanczos run.
+// The field names match the arguments of LocalCoherenceLanczos::calcFine and
+// calcCoarse declared later in this file (Nstop, Nk, Nm, resid, MaxIt, betastp,
+// MinRes), plus the Chebyshev parameters used to build the polynomial.
+struct LanczosParams : Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
+				  ChebyParams, Cheby,/* Chebyshev polynomial parameters */
+				  int, Nstop,    /* number of vectors that must converge; Nstop < Nk < Nm */
+				  int, Nk,       /* number of vectors the Lanczos tries to converge */
+				  int, Nm,       /* total number of Lanczos vectors, including restart workspace */
+				  RealD, resid,  /* residual tolerance */
+ 				  int, MaxIt, /* iteration limit -- presumably max restarts, TODO confirm */
+				  RealD, betastp,  /* meaning not documented here ("?" in the original) -- TODO confirm */
+				  int, MinRes);    // original note: "Must restart" -- presumably a minimum restart count, TODO confirm
+};
+
+// Serializable top-level parameter set for a local-coherence Lanczos job:
+// a LanczosParams for each of the fine and coarse stages, a ChebyParams smoother,
+// and a collection of run-control flags and physics inputs.
+// The field comments below are inferred from names only and should be checked
+// against the code that consumes this struct.
+struct LocalCoherenceLanczosParams : Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
+				  bool, saveEvecs,      /* presumably: write eigenvectors out -- TODO confirm */
+				  bool, doFine,         /* presumably: run the fine-grid stage */
+				  bool, doFineRead,     /* presumably: read fine-grid results instead of computing */
+				  bool, doCoarse,       /* presumably: run the coarse-grid stage */
+	       			  bool, doCoarseRead, /* presumably: read coarse-grid results instead of computing */
+				  LanczosParams, FineParams,    /* Lanczos parameters for the fine stage */
+				  LanczosParams, CoarseParams,  /* Lanczos parameters for the coarse stage */
+				  ChebyParams,   Smoother,      /* Chebyshev smoother parameters */
+				  RealD        , coarse_relax_tol, /* same name as the tester's relaxation factor in this file */
+				  std::vector<int>, blockSize,  /* presumably fine sites per coarse block, per dimension -- TODO confirm */
+				  std::string, config,          /* presumably a gauge configuration identifier/path -- TODO confirm */
+				  std::vector < ComplexD  >, omega, /* not interpreted in this file -- TODO confirm meaning */
+				  RealD, mass,                  /* not interpreted in this file */
+				  RealD, M5);                   /* not interpreted in this file */
+};
+
+// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
+// Coarse-space image of a fine-grid Hermitian operator.
+// Applying the object to a coarse vector promotes it to the fine grid through the
+// block basis `subspace`, applies _Linop.HermOp there, and block-projects the
+// result back onto the coarse grid.
+template<class Fobj,class CComplex,int nbasis>
+class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LinearOperatorBase<FineField> &_Linop;   // fine-grid operator, held by reference
+  std::vector<FineField>        &subspace; // block basis, held by reference
+
+  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : 
+    _Linop(linop), subspace(_subspace)
+  {  
+    // operator() takes its grid and checkerboard from subspace[0]
+    assert(subspace.size() >0);
+  };
+
+  // out = Project( HermOp( Promote(in) ) )
+  void operator()(const CoarseField& in, CoarseField& out) {
+    GridBase *fgrid = subspace[0].Grid();
+    int       cb    = subspace[0].Checkerboard();
+
+    // fine-grid work vectors on the same checkerboard as the basis
+    FineField fineIn (fgrid);
+    FineField fineOut(fgrid);
+    fineIn.Checkerboard()  = cb;
+    fineOut.Checkerboard() = cb;
+
+    blockPromote(in,fineIn,subspace);
+    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
+
+    _Linop.HermOp(fineIn,fineOut);
+    std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
+
+    blockProject(out,fineOut,subspace);
+    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
+  }
+};
+
+// Coarse-space image of a function of a fine-grid operator.
+// Applying the object to a coarse vector promotes it to the fine grid through the
+// block basis `subspace`, applies _poly(_Linop, .) there, and block-projects the
+// result back onto the coarse grid.
+template<class Fobj,class CComplex,int nbasis>
+class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+
+  OperatorFunction<FineField>   & _poly;   // function applied on the fine grid, held by reference
+  LinearOperatorBase<FineField> &_Linop;   // fine-grid operator, held by reference
+  std::vector<FineField>        &subspace; // block basis, held by reference
+
+  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
+			  LinearOperatorBase<FineField>& linop, 
+			  std::vector<FineField> & _subspace) :
+    _poly(poly),
+    _Linop(linop),
+    subspace(_subspace)
+  {  };
+
+  // out = Project( poly(Linop)( Promote(in) ) )
+  void operator()(const CoarseField& in, CoarseField& out) {
+
+    // Grid and checkerboard come from subspace[0]; an empty basis would be an
+    // out-of-range access.  Checked here rather than in the constructor so that
+    // a basis filled in after construction remains usable.
+    assert(subspace.size() >0);
+
+    GridBase *FineGrid = subspace[0].Grid();    
+    int   checkerboard = subspace[0].Checkerboard();
+
+    FineField fin (FineGrid); fin.Checkerboard() =checkerboard;
+    FineField fout(FineGrid);fout.Checkerboard() =checkerboard;
+    
+    blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
+    _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
+    blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
+  }
+};
+
+// Convergence tester for a coarse-grid implicitly restarted Lanczos.
+//  - TestConvergence works purely on the coarse grid using _Poly.
+//  - ReconstructEval promotes the coarse vector to the fine grid, applies the
+//    smoother, and measures the eigen-residual with the fine operator.
+// Both overwrite `eval` with the Rayleigh quotient they compute and return 1 when
+// the squared residual, scaled by evalMaxApprox^2, is below the squared tolerance,
+// 0 otherwise.
+template<class Fobj,class CComplex,int nbasis>
+class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
+{
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LinearFunction<CoarseField> & _Poly;
+  OperatorFunction<FineField>   & _smoother;
+  LinearOperatorBase<FineField> &_Linop;
+  RealD                             _coarse_relax_tol;
+  std::vector<FineField>        &_subspace;
+  
+  // Initialisers are listed in member declaration order (the order in which
+  // they actually run), which also keeps -Wreorder quiet.
+  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
+					   OperatorFunction<FineField>   &smoother,
+					   LinearOperatorBase<FineField> &Linop,
+					   std::vector<FineField>        &subspace,
+					   RealD coarse_relax_tol=5.0e3) 
+    : _Poly(Poly), _smoother(smoother), _Linop(Linop),
+      _coarse_relax_tol(coarse_relax_tol), _subspace(subspace)
+  {    };
+
+  // Coarse-grid test: residual of B under _Poly.
+  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
+  {
+    CoarseField v(B);
+    RealD eval_poly = eval; // incoming estimate, kept only for the log line
+
+    // Apply operator
+    _Poly(B,v);
+
+    RealD vnum = real(innerProduct(B,v)); // HermOp.
+    RealD vden = norm2(B);
+    eval   = vnum/vden;
+    v -= eval*B;
+
+    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
+
+    std::cout.precision(13);
+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<std::endl;
+
+    int conv=0;
+    if( (vv<eresid*eresid) ) conv = 1;
+    return conv;
+  }
+
+  // Fine-grid test: promote B, smooth it, and measure the residual under _Linop.HermOp.
+  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
+  {
+    GridBase *FineGrid = _subspace[0].Grid();    
+    int checkerboard   = _subspace[0].Checkerboard();
+    FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
+    FineField fv(FineGrid);fv.Checkerboard() =checkerboard;
+
+    blockPromote(B,fv,_subspace);  
+    
+    _smoother(_Linop,fv,fB); 
+
+    RealD eval_poly = eval; // incoming estimate, kept only for the log line
+    _Linop.HermOp(fB,fv);
+
+    RealD vnum = real(innerProduct(fB,fv)); // HermOp.
+    RealD vden = norm2(fB);
+    eval   = vnum/vden;
+    fv -= eval*fB;
+    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
+
+    std::cout.precision(13);
+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<std::endl;
+    // Vectors with index above nbasis are tested against a relaxed tolerance.
+    // NOTE(review): LocalCoherenceLanczos::testCoarse in this file switches at
+    // k >= nbasis and already passes resid*relax, so on that path the relaxation
+    // is applied twice for j > nbasis and the boundary index differs -- confirm intended.
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
+    if( (vv<eresid*eresid) ) return 1;
+    return 0;
+  }
+};
+
+////////////////////////////////////////////
+// Make serializable Lanczos params
+////////////////////////////////////////////
+// Driver for a two-level ("local coherence") Lanczos:
+//   calcFine   runs an implicitly restarted Lanczos on the fine grid and keeps
+//              the first nbasis vectors as the block basis `subspace`;
+//   calcCoarse runs an implicitly restarted Lanczos on the coarse grid using
+//              operators projected through that basis;
+//   testFine / testCoarse re-check the stored eigenpairs.
+// Results live either in storage owned by this object (first constructor) or in
+// caller-supplied vectors (second constructor); the protected reference members
+// point at whichever is in use.
+template<class Fobj,class CComplex,int nbasis>
+class LocalCoherenceLanczos 
+{
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<Fobj>                       FineField;
+
+protected:
+  GridBase *_CoarseGrid;
+  GridBase *_FineGrid;
+  int _checkerboard;
+  LinearOperatorBase<FineField>                 & _FineOp;
+  
+  // Active storage: bound to the private members below or to external vectors.
+  std::vector<RealD>                              &evals_fine;
+  std::vector<RealD>                              &evals_coarse; 
+  std::vector<FineField>                          &subspace;
+  std::vector<CoarseField>                        &evec_coarse;
+
+private:
+  // Internal storage, used only by the first constructor.
+  std::vector<RealD>                              _evals_fine;
+  std::vector<RealD>                              _evals_coarse; 
+  std::vector<FineField>                          _subspace;
+  std::vector<CoarseField>                        _evec_coarse;
+
+public:
+
+  //////////////////////////////////////////////////////////////////////////
+  // Constructor using internal storage.
+  // Initialisers are listed in member declaration order (the order in which
+  // they actually run).
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _checkerboard(checkerboard),
+    _FineOp(FineOp),
+    evals_fine  (_evals_fine),
+    evals_coarse(_evals_coarse),
+    subspace    (_subspace),
+    evec_coarse(_evec_coarse)
+  {
+    evals_fine.resize(0);
+    evals_coarse.resize(0);
+  };
+  //////////////////////////////////////////////////////////////////////////
+  // Alternate constructor, external storage for use by Hadrons module.
+  // Note that the two eigenvalue vectors are emptied here.
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard,
+			std::vector<FineField>   &ext_subspace,
+			std::vector<CoarseField> &ext_coarse,
+			std::vector<RealD>       &ext_eval_fine,
+			std::vector<RealD>       &ext_eval_coarse
+			) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _checkerboard(checkerboard),
+    _FineOp(FineOp),
+    evals_fine  (ext_eval_fine), 
+    evals_coarse(ext_eval_coarse),
+    subspace    (ext_subspace),
+    evec_coarse (ext_coarse)
+  {
+    evals_fine.resize(0);
+    evals_coarse.resize(0);
+  };
+
+  // Block-orthogonalise the fine basis; the procedure is run twice.
+  void Orthogonalise(void ) {
+    CoarseScalar InnerProd(_CoarseGrid);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);
+  };
+
+  // Scale v to unit norm and return its norm before scaling.
+  template<typename T>  static RealD normalise(T& v) 
+  {
+    RealD nn = norm2(v);
+    nn = ::sqrt(nn);
+    v = v * (1.0/nn);
+    return nn;
+  }
+  /*
+  void fakeFine(void)
+  {
+    int Nk = nbasis;
+    subspace.resize(Nk,_FineGrid);
+    subspace[0]=1.0;
+    subspace[0].Checkerboard()=_checkerboard;
+    normalise(subspace[0]);
+    PlainHermOp<FineField>    Op(_FineOp);
+    for(int k=1;k<Nk;k++){
+      subspace[k].Checkerboard()=_checkerboard;
+      Op(subspace[k-1],subspace[k]);
+      normalise(subspace[k]);
+    }
+  }
+  */
+
+  // Re-test every fine eigenpair against `resid`; asserts that each one passes.
+  void testFine(RealD resid) 
+  {
+    assert(static_cast<int>(evals_fine.size()) == nbasis);
+    assert(static_cast<int>(subspace.size()) == nbasis);
+    PlainHermOp<FineField>    Op(_FineOp);
+    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
+    for(int k=0;k<nbasis;k++){
+      // The call is made outside assert() so that it still runs when asserts
+      // are compiled out (NDEBUG); evals_fine[k] is passed by reference.
+      const int ok = SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0);
+      assert(ok==1);
+      (void)ok;
+    }
+  }
+
+  // Re-test every coarse eigenpair with the smoothed tester; vectors with
+  // index >= nbasis are given the tolerance resid*relax.
+  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
+  {
+    assert(static_cast<int>(evals_fine.size()) == nbasis);
+    assert(static_cast<int>(subspace.size()) == nbasis);
+    //////////////////////////////////////////////////////////////////////////////////////////////////
+    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
+    //////////////////////////////////////////////////////////////////////////////////////////////////
+    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+
+    const int Ncoarse = static_cast<int>(evec_coarse.size());
+    for(int k=0;k<Ncoarse;k++){
+      const RealD tol = ( k < nbasis ) ? resid : resid*relax;
+      // As in testFine: call first, then assert on the result, so the call is
+      // not removed under NDEBUG.
+      const int ok = ChebySmoothTester.ReconstructEval(k,tol,evec_coarse[k],evals_coarse[k],1.0);
+      assert(ok==1);
+      (void)ok;
+    }
+  }
+
+  // Fine-grid Lanczos.  On return evals_fine and subspace hold exactly nbasis entries.
+  // NOTE(review): MaxIt is declared RealD here but int in LanczosParams.
+  void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, 
+		RealD MaxIt, RealD betastp, int MinRes)
+  {
+    assert(nbasis<=Nm);
+    Chebyshev<FineField>      Cheby(cheby_parms);
+    FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
+    PlainHermOp<FineField>    Op(_FineOp);
+
+    evals_fine.resize(Nm);
+    subspace.resize(Nm,_FineGrid);
+
+    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
+
+    FineField src(_FineGrid); src=1.0; src.Checkerboard() = _checkerboard;
+
+    int Nconv;
+    IRL.calc(evals_fine,subspace,src,Nconv,false);
+    
+    // Shrink down to number saved
+    assert(Nstop>=nbasis);
+    assert(Nconv>=nbasis);
+    evals_fine.resize(nbasis);
+    subspace.resize(nbasis,_FineGrid);
+  }
+
+  // Coarse-grid Lanczos using operators projected through `subspace`.
+  // On return evals_coarse and evec_coarse hold exactly Nstop entries.
+  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
+		  int Nstop, int Nk, int Nm,RealD resid, 
+		  RealD MaxIt, RealD betastp, int MinRes)
+  {
+    Chebyshev<FineField>                          Cheby(cheby_op);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
+    //////////////////////////////////////////////////////////////////////////////////////////////////
+    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
+    //////////////////////////////////////////////////////////////////////////////////////////////////
+
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
+
+    evals_coarse.resize(Nm);
+    evec_coarse.resize(Nm,_CoarseGrid);
+
+    CoarseField src(_CoarseGrid);     src=1.0; 
+
+    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
+    int Nconv=0;
+    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
+    assert(Nconv>=Nstop);
+    evals_coarse.resize(Nstop);
+    evec_coarse.resize (Nstop,_CoarseGrid);
+    for (int i=0;i<Nstop;i++){
+      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
+    }
+  }
+};
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/NormalEquations.h
+++ b/Grid/algorithms/iterative/NormalEquations.h
@@ -0,0 +1,60 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/NormalEquations.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_NORMAL_EQUATIONS_H
+#define GRID_NORMAL_EQUATIONS_H
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Take a matrix and form an NE solver calling a Herm solver
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// operator() forms Mdag*in with the wrapped matrix and passes that vector to the
+// Hermitian solver, which writes its answer into `out`.
+// NOTE(review): elsewhere in this file OperatorFunction objects are invoked as
+// (Linop, in, out); the two-argument solver call below does not match that form.
+// Confirm whether this template is ever instantiated.
+template<class Field> class NormalEquations : public OperatorFunction<Field>{
+private:
+  SparseMatrixBase<Field> & _Matrix;          // matrix M, held by reference
+  OperatorFunction<Field> & _HermitianSolver; // solver, held by reference
+
+public:
+
+  /////////////////////////////////////////////////////
+  // Wrap the usual normal equations trick
+  /////////////////////////////////////////////////////
+  NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver) 
+    : _Matrix(Matrix),
+      _HermitianSolver(HermitianSolver)
+  {}
+
+  void operator() (const Field &in, Field &out){
+    // right-hand side of the normal equations: Mdag in
+    Field MdagIn(in.Grid());
+    _Matrix.Mdag(in,MdagIn);
+
+    // Mdag M out = Mdag in
+    _HermitianSolver(MdagIn,out);
+  }     
+};
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/PrecConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecConjugateResidual.h
@@ -0,0 +1,119 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/PrecConjugateResidual.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
+#define GRID_PREC_CONJUGATE_RESIDUAL_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////
+// Base classes for iterative processes based on operators
+// single input vec, single output vec.
+/////////////////////////////////////////////////////////////
+
+// Preconditioned conjugate-residual style solver.
+// operator()(Linop, src, psi) starts from psi = 0 and iterates until the
+// recursively updated residual norm^2 `cp` falls below Tolerance^2 * |src|^2, then
+// recomputes the true residual with Linop.HermOp for the log and returns.
+// If MaxIterations passes complete without reaching the target it logs a message
+// and asserts.
+template<class Field> 
+class PrecConjugateResidual : public OperatorFunction<Field> {
+public:                                                
+  RealD   Tolerance;      // relative tolerance on |r|/|src|
+  Integer MaxIterations;  // iteration limit
+  int verbose;            // non-zero: log the residual every iteration
+  LinearFunction<Field> &Preconditioner; // applied as Preconditioner(in,out)
+
+  PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit),      Preconditioner(Prec)
+  { 
+    verbose=1;
+  };
+
+  void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+
+    RealD a, b;
+    RealD cp, ssq,rsq;
+      
+    RealD rAr, rAAr, rArp;
+    RealD pAp, pAAp;
+
+    GridBase *grid = src.Grid();
+    Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid);
+      
+    // Zero() is the spelling used by the other solvers in this file
+    // (e.g. PrecGeneralisedConjugateResidual); the bare `zero` object is the
+    // older form.
+    psi=Zero();
+    r  = src;
+    Preconditioner(r,p);
+
+    Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
+    Ar=Ap;
+    rAr=pAp;
+    rAAr=pAAp;
+
+    cp =norm2(r);
+    ssq=norm2(src);
+    rsq=Tolerance*Tolerance*ssq; // target for the squared residual norm
+
+    if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+    for(int k=0;k<MaxIterations;k++){
+
+      Preconditioner(Ap,z);
+      RealD rq= real(innerProduct(Ap,z)); 
+
+      a = rAr/rq;
+
+      axpy(psi,a,p,psi);          // psi <- a*p + psi
+      cp = axpy_norm(r,-a,z,r);   // r   <- -a*z + r ; cp is the value axpy_norm returns
+
+      rArp=rAr;
+
+      Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
+
+      b   =rAr/rArp;
+ 
+      axpy(p,b,p,r);              // p  <- b*p  + r
+      pAAp=axpy_norm(Ap,b,Ap,Ar); // Ap <- b*Ap + Ar
+	
+      if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+      if(cp<rsq) {
+	// Converged by the recursive residual: recompute the true one for the log.
+	Linop.HermOp(psi,Ap);
+	axpy(r,-1.0,src,Ap);
+	RealD true_resid = norm2(r)/ssq;
+	std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
+		 << " computed residual "<<sqrt(cp/ssq)
+		 << " true residual "<<sqrt(true_resid)
+		 << " target "       <<Tolerance <<std::endl;
+	return;
+      }
+
+    }
+
+    std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
+    assert(0);
+  }
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -0,0 +1,230 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_PREC_GCR_H
+#define GRID_PREC_GCR_H
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+//VPGCR Abe and Zhang, 2005.
+//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
+//Computing and Information Volume 2, Number 2, Pages 147-161
+//NB. Likely not original reference since they are focussing on a preconditioner variant.
+//    but VPGCR was nicely written up in their paper
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+NAMESPACE_BEGIN(Grid);
+
+// Variable-preconditioned GCR with restarts.
+// operator() zeroes psi and then calls GCRnStep up to MaxIterations times; each
+// call performs at most nstep inner steps, orthogonalising against a history of
+// at most mmax search directions.  It returns once the value GCRnStep reports
+// drops below Tolerance^2 * |src|^2, and asserts if that never happens.
+// The three public timers accumulate preconditioner, matrix and linear-algebra
+// time (the last one is reset and reported but never started in this class).
+template<class Field>
+class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
+public:                                                
+  RealD   Tolerance;
+  Integer MaxIterations;
+  int verbose;
+  int mmax;   // length of the direction history
+  int nstep;  // inner steps per GCRnStep call
+  int steps;  // running count of inner steps in the current solve
+  GridStopWatch PrecTimer;
+  GridStopWatch MatTimer;
+  GridStopWatch LinalgTimer;
+
+  LinearFunction<Field> &Preconditioner;
+
+  // Initialisers are listed in member declaration order (the order in which
+  // they actually run), which also keeps -Wreorder quiet.
+  PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
+    Tolerance(tol), 
+    MaxIterations(maxit),
+    verbose(1),
+    mmax(_mmax),
+    nstep(_nstep),
+    steps(0),
+    Preconditioner(Prec)
+  { 
+  };
+
+  void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+
+    psi=Zero();
+    RealD cp, ssq,rsq;
+    ssq=norm2(src);
+    rsq=Tolerance*Tolerance*ssq; // target for the squared residual norm
+      
+    Field r(src.Grid());
+
+    PrecTimer.Reset();
+    MatTimer.Reset();
+    LinalgTimer.Reset();
+
+    GridStopWatch SolverTimer;
+    SolverTimer.Start();
+
+    steps=0;
+    for(int k=0;k<MaxIterations;k++){
+
+      cp=GCRnStep(Linop,src,psi,rsq);
+
+      std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+
+      if(cp<rsq) {
+
+	SolverTimer.Stop();
+
+	// Recompute the true residual for the log.
+	Linop.HermOp(psi,r);
+	axpy(r,-1.0,src,r);
+	RealD tr = norm2(r);
+	std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
+		 << " computed residual "<<sqrt(cp/ssq)
+		 << " true residual "    <<sqrt(tr/ssq)
+		 << " target "           <<Tolerance <<std::endl;
+
+	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
+	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<<   PrecTimer.Elapsed() <<std::endl;
+	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<<    MatTimer.Elapsed() <<std::endl;
+	std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
+	return;
+      }
+
+    }
+    std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
+    assert(0);
+  }
+
+  // One restart cycle: up to nstep inner steps starting from the current psi.
+  // Updates psi in place and returns the last value produced by the residual
+  // update (compared against rsq by the caller).
+  RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
+
+    RealD cp;
+    RealD a, b;
+    RealD zAz, zAAz;
+    RealD rq;
+
+    // The history is indexed modulo mmax and slot 0 is written unconditionally.
+    assert(mmax>0);
+
+    GridBase *grid = src.Grid();
+
+    Field r(grid);
+    Field z(grid);
+    Field tmp(grid);
+    Field Az(grid);
+
+    ////////////////////////////////
+    // history for flexible orthog
+    ////////////////////////////////
+    std::vector<Field> q(mmax,grid);
+    std::vector<Field> p(mmax,grid);
+    std::vector<RealD> qq(mmax);
+      
+    //////////////////////////////////
+    // psi carries the current iterate from one restart to the next, so the
+    // residual is formed explicitly:  r = src - A psi
+    //////////////////////////////////
+    MatTimer.Start();
+    Linop.HermOpAndNorm(psi,Az,zAz,zAAz); 
+    MatTimer.Stop();
+    r=src-Az;
+      
+    /////////////////////
+    // z = Prec(r)
+    /////////////////////
+    PrecTimer.Start();
+    Preconditioner(r,z);
+    PrecTimer.Stop();
+
+    // (An extra Linop.HermOp(z,tmp) used to be applied here; its result was
+    //  never read, so it only cost one operator application per restart.)
+
+    MatTimer.Start();
+    Linop.HermOpAndNorm(z,Az,zAz,zAAz); 
+    MatTimer.Stop();
+
+    //p[0],q[0],qq[0] 
+    p[0]= z;
+    q[0]= Az;
+    qq[0]= zAAz;
+
+    cp =norm2(r);
+
+    for(int k=0;k<nstep;k++){
+
+      steps++;
+
+      int kp     = k+1;
+      int peri_k = k %mmax;   // history slot of the current direction
+      int peri_kp= kp%mmax;   // history slot the next direction will occupy
+
+      rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
+      a = rq/qq[peri_k];
+
+      axpy(psi,a,p[peri_k],psi);         
+
+      cp = axpy_norm(r,-a,q[peri_k],r);  
+
+      if((k==nstep-1)||(cp<rsq)){
+	return cp;
+      }
+
+      std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"]  resid " <<sqrt(cp/rsq)<<std::endl; 
+
+      PrecTimer.Start();
+      Preconditioner(r,z);// solve Az = r
+      PrecTimer.Stop();
+
+      MatTimer.Start();
+      Linop.HermOpAndNorm(z,Az,zAz,zAAz);
+      // NOTE(review): this second application of Linop to z is used only for the
+      // diagnostic below; it looks replaceable by tmp=Az -- confirm that
+      // HermOpAndNorm leaves HermOp(z) in Az before changing it.
+      Linop.HermOp(z,tmp);
+      MatTimer.Stop();
+      tmp=tmp-r;
+      std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl; 
+
+      q[peri_kp]=Az;
+      p[peri_kp]=z;
+
+      int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history.
+      for(int back=0;back<northog;back++){
+
+	int peri_back=(k-back)%mmax;   	  assert((k-back)>=0);
+
+	b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
+	p[peri_kp]=p[peri_kp]+b*p[peri_back];
+	q[peri_kp]=q[peri_kp]+b*q[peri_back];
+
+      }
+      qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
+
+
+    }
+    assert(0); // only reached when nstep <= 0
+    return cp;
+  }
+};
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -0,0 +1,473 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_SCHUR_RED_BLACK_H
+#define GRID_SCHUR_RED_BLACK_H
+
+
+  /*
+   * Red black Schur decomposition
+   *
+   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
+   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
+   *                =         L                     D                     U
+   *
+   * L^-1 = (1              0 )
+   *        (-MoeMee^{-1}   1 )   
+   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
+   *           ( 0       1                    )
+   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
+   *           ( 0       1                    )
+   *
+   * U^-1 = (1   -Mee^{-1} Meo)
+   *        (0    1           )
+   * U^{dag} = ( 1                 0)
+   *           (Meo^dag Mee^{-dag} 1)
+   * U^{-dag} = (  1                 0)
+   *            (-Meo^dag Mee^{-dag} 1)
+   ***********************
+   *     M psi = eta
+   ***********************
+   *Odd
+   * i)                 D_oo psi_o =  L^{-1}  eta_o
+   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * Wilson:
+   *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
+   * Stag:
+   *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * L^-1 eta  = (1              0 ) (eta_e)
+   *             (-MoeMee^{-1}   1 ) (eta_o)
+   *
+   *Even
+   * ii)  Mee psi_e + Meo psi_o = src_e
+   *
+   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+   *
+   * 
+   * TODO: Other options:
+   * 
+   * a) change checkerboards for Schur e<->o
+   *
+   * Left precon by Moo^-1
+   * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_o =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
+   *                              eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * Right precon by Moo^-1
+   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_o = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
+   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
+   *                              psi_o = M_oo^-1 phi_o
+   * TODO: Deflation 
+   */
+namespace Grid {
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Use base class to share code
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Take a matrix and form a Red Black solver calling a Herm solver
+  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Common driver for red-black (even/odd checkerboard) Schur solves.
+  ///
+  /// operator() splits the source into checkerboards (RedBlackSource), obtains an
+  /// initial guess, runs the checkerboarded solve (RedBlackSolve) and rebuilds the
+  /// full-grid solution (RedBlackSolution).  Derived classes supply those steps.
+  template<class Field> class SchurRedBlackBase {
+  protected:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+    OperatorFunction<Field> & _HermitianRBSolver; // held by reference, not owned
+    int CBfactorise;                              // initialised to 0; not read anywhere in this class
+    bool subGuess;                                // true => subtract the initial guess from the solve result
+  public:
+
+    /// @param HermitianRBSolver  solver stored by reference; it must outlive this object.
+    /// @param initSubGuess       initial value of the subtract-guess flag.
+    SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)
+      : _HermitianRBSolver(HermitianRBSolver),
+        CBfactorise(0),
+        subGuess(initSubGuess)
+    {
+    }
+
+    // Polymorphic base (pure virtuals below): a virtual destructor makes
+    // destruction through a base pointer well defined.
+    virtual ~SchurRedBlackBase() = default;
+
+    /// Enable or disable subtraction of the initial guess after the solve.
+    void subtractGuess(const bool initSubGuess)
+    {
+      subGuess = initSubGuess;
+    }
+    /// @return the current subtract-guess flag.
+    bool isSubtractGuess(void) const
+    {
+      return subGuess;
+    }
+
+    /////////////////////////////////////////////////////////////
+    // Shared code
+    /////////////////////////////////////////////////////////////
+
+    /// Single right-hand-side solve using a ZeroGuesser for the initial guess.
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    /// Multiple right-hand-side solve using a ZeroGuesser for the initial guesses.
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out) 
+    {
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+
+    /// Multiple right-hand-side solve.
+    /// @param _Matrix  checkerboarded matrix; supplies the full and red-black grids.
+    /// @param in       full-grid sources, one per right-hand side.
+    /// @param out      full-grid solutions; must already hold at least in.size() fields.
+    /// @param guess    callable invoked as guess(src_o, sol_o) to fill the initial guess.
+    template<class Guesser>
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) 
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+      int nblock = in.size();
+
+      // out[b] is written for every source below; catch an undersized vector
+      // here rather than indexing out of bounds.
+      assert(out.size() >= in.size());
+
+      std::vector<Field> src_o(nblock,grid);
+      std::vector<Field> sol_o(nblock,grid);
+      
+      std::vector<Field> guess_save;
+
+      Field resid(fgrid);
+      Field tmp(grid);
+
+      ////////////////////////////////////////////////
+      // Prepare RedBlack source
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++){
+        // tmp receives the even part here and is overwritten on each pass;
+        // it is re-extracted from in[b] after the solve.
+        RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
+      }
+      ////////////////////////////////////////////////
+      // Make the guesses
+      ////////////////////////////////////////////////
+      if ( subGuess ) guess_save.resize(nblock,grid);
+
+      for(int b=0;b<nblock;b++){
+        guess(src_o[b],sol_o[b]); 
+
+        if ( subGuess ) { 
+          guess_save[b] = sol_o[b];
+        }
+      }
+      //////////////////////////////////////////////////////////////
+      // Call the block solver
+      //////////////////////////////////////////////////////////////
+      std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
+      // A2A boolean behavioural control & reconstruct other checkerboard
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++) {
+
+        if (subGuess)   sol_o[b] = sol_o[b] - guess_save[b];
+
+        ///////// Needs even source //////////////
+        pickCheckerboard(Even,tmp,in[b]);
+        RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
+
+        /////////////////////////////////////////////////
+        // Check unprec residual if possible
+        // (skipped when the guess was subtracted)
+        /////////////////////////////////////////////////
+        if ( ! subGuess ) {
+          _Matrix.M(out[b],resid); 
+          resid = resid-in[b];
+          RealD ns = norm2(in[b]);
+          RealD nr = norm2(resid);
+        
+          std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+        } else {
+          std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
+        }
+
+      }
+    }
+
+    /// Single right-hand-side solve.
+    /// @param _Matrix  checkerboarded matrix; supplies the full and red-black grids.
+    /// @param in       full-grid source.
+    /// @param out      full-grid solution, written by RedBlackSolution.
+    /// @param guess    callable invoked as guess(src_o, sol_o) to fill the initial guess.
+    template<class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
+
+      // FIXME CGdiagonalMee not implemented virtual function
+      // FIXME use CBfactorise to control schur decomp
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field resid(fgrid);
+      Field src_o(grid);
+      Field src_e(grid);
+      Field sol_o(grid);
+
+      ////////////////////////////////////////////////
+      // RedBlack source
+      ////////////////////////////////////////////////
+      RedBlackSource(_Matrix,in,src_e,src_o);
+
+      ////////////////////////////////
+      // Construct the guess
+      ////////////////////////////////
+      guess(src_o,sol_o);
+
+      // The saved guess is only read when subGuess is set, so only copy it then.
+      Field  guess_save(grid);
+      if ( subGuess ) guess_save = sol_o;
+
+      //////////////////////////////////////////////////////////////
+      // Call the red-black solver
+      //////////////////////////////////////////////////////////////
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
+      // Fionn A2A boolean behavioural control
+      ////////////////////////////////////////////////
+      if (subGuess)      sol_o= sol_o-guess_save;
+
+      ///////////////////////////////////////////////////
+      // RedBlack solution needs the even source
+      ///////////////////////////////////////////////////
+      RedBlackSolution(_Matrix,sol_o,src_e,out);
+
+      // Verify the unprec residual (skipped when the guess was subtracted)
+      if ( ! subGuess ) {
+        _Matrix.M(out,resid); 
+        resid = resid-in;
+        RealD ns = norm2(in);
+        RealD nr = norm2(resid);
+
+        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
+      } else {
+        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
+      }
+    }     
+    
+    /////////////////////////////////////////////////////////////
+    // Pure virtual hooks; every derived class must implement them.
+    // (The Guesser-templated operator() overloads above cannot be virtual.)
+    /////////////////////////////////////////////////////////////
+    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)=0;
+
+  };
+
+  /// Red-black Schur solver for the staggered case: the source is prepared with
+  /// Mooee and the checkerboarded solve uses SchurStaggeredOperator.
+  template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /// Forwards both arguments unchanged to SchurRedBlackBase.
+    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) 
+      :    SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) 
+    {
+    }
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+
+    /// Split src into src_e / src_o, then overwrite src_o with
+    /// Mooee (src_o - Moe MeeInv src_e).
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+
+      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
+    }
+
+    /// Rebuild the full-grid solution from the odd solution and the even source.
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+      Field   rhs_e(grid);
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.Checkerboard()   ==Even);
+      // Form the right-hand side directly from the const input; no need to
+      // copy the whole even source first.
+      rhs_e = src_e_c-tmp;             assert(  rhs_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(rhs_e,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
+    }
+
+    /// Single right-hand-side solve with SchurStaggeredOperator.
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
+    }
+
+    /// Multiple right-hand-side solve with SchurStaggeredOperator.
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+  // Shorter alias: SchurRedBlackStagSolve<Field> is the same type as SchurRedBlackStaggeredSolve<Field>.
+  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal has Mooee on it.
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Red-black Schur solver built on SchurDiagMooeeOperator: the odd source is
+  /// prepared with that operator's MpcDag and the same operator drives the solve.
+  template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /// Forwards both arguments unchanged to SchurRedBlackBase.
+    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) {}
+
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+
+    /// Split src into src_e / src_o, then overwrite src_o with
+    /// MpcDag (src_o - Moe MeeInv src_e).
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+
+      // get the right MpcDag
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+
+    }
+
+    /// Rebuild the full-grid solution from the odd solution and the even source.
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field  sol_e(grid);
+      Field  src_e_i(grid);
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);          assert(  tmp.Checkerboard()   ==Even);
+      src_e_i = src_e-tmp;               assert(  src_e_i.Checkerboard() ==Even);
+      _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
+    }
+
+    /// Single right-hand-side solve with SchurDiagMooeeOperator.
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
+    }
+
+    /// Multiple right-hand-side solve with SchurDiagMooeeOperator.
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, right preconditioned by Mee^inv
+  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi = ( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi = eta
+  //=> psi = MeeInv phi
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Red-black Schur solver built on SchurDiagTwoOperator.  RedBlackSolution
+  /// applies an extra MooeeInv to the solver output before rebuilding the
+  /// full-grid solution.
+  template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /////////////////////////////////////////////////////
+    // Wrap the usual normal equations Schur trick
+    /////////////////////////////////////////////////////
+
+    /// Forwards both arguments unchanged to SchurRedBlackBase.
+    SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+      : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess) {}
+
+    /// Split src into src_e / src_o, then overwrite src_o with
+    /// MpcDag (src_o - Moe MeeInv src_e).
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+    
+      /////////////////////////////////////////////////////
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+
+      // get the right MpcDag
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+    }
+
+    /// Rebuild the full-grid solution from the solver output and the even source.
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) override
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   sol_o_i(grid);
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+      ////////////////////////////////////////////////
+      // MooeeInv due to precond
+      ////////////////////////////////////////////////
+      // Write straight into sol_o_i; going through tmp and copying costs an
+      // extra full-field assignment for the same result.
+      _Matrix.MooeeInv(sol_o,sol_o_i);
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.Checkerboard()   ==Even);
+      tmp = src_e-tmp;               assert(  tmp.Checkerboard()   ==Even);
+      _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e);    assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.Checkerboard() ==Odd );
+    }
+
+    /// Single right-hand-side solve with SchurDiagTwoOperator.
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    }
+
+    /// Multiple right-hand-side solve with SchurDiagTwoOperator.
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+}
+#endif