diff --git a/lib/Algorithms.h b/lib/Algorithms.h
index 0252606f..2007cf60 100644
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -5,14 +5,17 @@
 #include <algorithms/LinearOperator.h>
 #include <algorithms/CoarsenedMatrix.h>
 
+#include <algorithms/approx/Zolotarev.h>
+#include <algorithms/approx/Chebyshev.h>
+#include <algorithms/approx/Remez.h>
+#include <algorithms/approx/MultiShiftFunction.h>
+
 #include <algorithms/iterative/ConjugateGradient.h>
 #include <algorithms/iterative/ConjugateResidual.h>
 #include <algorithms/iterative/NormalEquations.h>
 #include <algorithms/iterative/SchurRedBlack.h>
 
-#include <algorithms/approx/Zolotarev.h>
-#include <algorithms/approx/Chebyshev.h>
-#include <algorithms/approx/Remez.h>
+#include <algorithms/iterative/ConjugateGradientMultiShift.h>
 
 // Eigen/lanczos
 // EigCg
diff --git a/lib/Make.inc b/lib/Make.inc
index 069e818d..c38606e2 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./Tensors.h ./Threads.h
+HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h
 
-CCFILES=./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index ee0b2ec1..1cf78501 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -207,6 +207,14 @@ namespace Grid {
       virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
     };
 
+    /////////////////////////////////////////////////////////////
+    // Base classes for Multishift solvers for operators
+    /////////////////////////////////////////////////////////////
+    template<class Field> class OperatorMultiFunction {
+    public:
+      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
+    };
+
     // FIXME : To think about
 
     // Chroma functionality list defining LinearOperator
diff --git a/lib/algorithms/approx/MultiShiftFunction.cc b/lib/algorithms/approx/MultiShiftFunction.cc
new file mode 100644
index 00000000..8f62dffd
--- /dev/null
+++ b/lib/algorithms/approx/MultiShiftFunction.cc
@@ -0,0 +1,29 @@
+#include <Grid.h>
+
+namespace Grid {
+double MultiShiftFunction::approx(double x)
+{
+  double a = norm;
+  for(int n=0;n<poles.size();n++){
+    a = a + residues[n]/(x+poles[n]);
+  }
+  return a;
+}
+void MultiShiftFunction::gnuplot(std::ostream &out)
+{
+  out<<"f(x) = "<<norm<<"";
+  for(int n=0;n<poles.size();n++){
+    out<<"+("<<residues[n]<<"/(x+"<<poles[n]<<"))";
+  }
+  out<<";"<<std::endl;
+}
+void MultiShiftFunction::csv(std::ostream &out)
+{
+  for (double x=lo; x<hi; x*=1.05) {
+    double f = approx(x);
+    double r = sqrt(x);
+    out<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
+  }
+  return;
+}
+}
diff --git a/lib/algorithms/approx/MultiShiftFunction.h b/lib/algorithms/approx/MultiShiftFunction.h
new file mode 100644
index 00000000..4136736e
--- /dev/null
+++ b/lib/algorithms/approx/MultiShiftFunction.h
@@ -0,0 +1,28 @@
+#ifndef MULTI_SHIFT_FUNCTION
+#define MULTI_SHIFT_FUNCTION
+namespace Grid {
+class MultiShiftFunction {
+public:
+  int order;
+  std::vector<RealD> poles;
+  std::vector<RealD> residues;
+  std::vector<RealD> tolerances;
+  RealD norm;
+  RealD lo,hi;
+  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
+  RealD approx(RealD x);
+  void csv(std::ostream &out);
+  void gnuplot(std::ostream &out);
+  MultiShiftFunction(AlgRemez & remez,double tol,bool inverse) :
+      order(remez.getDegree()),
+      tolerances(remez.getDegree(),tol),
+      poles(remez.getDegree()),
+      residues(remez.getDegree())
+  {
+    remez.getBounds(lo,hi);
+    if ( inverse ) remez.getIPFE (&residues[0],&poles[0],&norm);
+    else remez.getPFE (&residues[0],&poles[0],&norm);
+  }
+};
+}
+#endif
diff --git a/lib/algorithms/approx/Remez.h b/lib/algorithms/approx/Remez.h
index 906811be..336382d2 100644
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@@ -125,8 +125,17 @@ class AlgRemez
   // Destructor
   virtual ~AlgRemez();
 
+  int getDegree(void){ 
+    assert(n==d);
+    return n;
+  }
   // Reset the bounds of the approximation
   void setBounds(double lower, double upper);
+  // Reset the bounds of the approximation
+  void getBounds(double &lower, double &upper) { 
+    lower=(double)apstrt;
+    upper=(double)apend;
+  }
 
   // Generate the rational approximation x^(pnum/pden)
   double generateApprox(int num_degree, int den_degree, 
diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
new file mode 100644
index 00000000..580ab838
--- /dev/null
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -0,0 +1,250 @@
+#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
+#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
+
+namespace Grid {
+
+    /////////////////////////////////////////////////////////////
+    // Base classes for iterative processes based on operators
+    // single input vec, single output vec.
+    /////////////////////////////////////////////////////////////
+
+  template<class Field> 
+    class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
+                                        public OperatorFunction<Field>
+    {
+public:                                                
+    RealD   Tolerance;
+    Integer MaxIterations;
+    int verbose;
+    MultiShiftFunction shifts;
+
+    ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
+	MaxIterations(maxit),
+	shifts(_shifts)
+    { 
+      verbose=1;
+    }
+
+void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
+{
+
+  GridBase *grid = src._grid;
+  int nshift = shifts.order;
+  std::vector<Field> results(nshift,grid);
+
+  (*this)(Linop,src,results);
+  
+  psi = shifts.norm*src;
+  for(int i=0;i<nshift;i++){
+    psi = psi + shifts.residues[i]*results[i];
+  }
+
+  return;
+}
+
+void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
+{
+  
+  GridBase *grid = src._grid;
+  
+  ////////////////////////////////////////////////////////////////////////
+  // Convenience references to the info stored in "MultiShiftFunction"
+  ////////////////////////////////////////////////////////////////////////
+  int nshift = shifts.order;
+
+  std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
+  std::vector<RealD> &mresidual(shifts.tolerances);
+  std::vector<RealD> alpha(nshift,1.0);
+  std::vector<Field>   ps(nshift,grid);// Search directions
+
+  assert(psi.size()==nshift);
+  assert(mass.size()==nshift);
+  assert(mresidual.size()==nshift);
+  
+  // dynamic sized arrays on stack; 2d is a pain with vector
+  RealD  bs[nshift];
+  RealD  rsq[nshift];
+  RealD  z[nshift][2];
+  int     converged[nshift];
+  
+  const int       primary =0;
+  
+  //Primary shift fields CG iteration
+  RealD a,b,c,d;
+  RealD cp,bp,qq; //prev
+  
+  // Matrix mult fields
+  Field r(grid);
+  Field p(grid);
+  Field tmp(grid);
+  Field mmp(grid);
+  
+  // Check lightest mass
+  for(int s=0;s<nshift;s++){
+    assert( mass[s]>= mass[primary] );
+    converged[s]=0;
+  }
+  
+  // Wire guess to zero
+  // Residuals "r" are src
+  // First search direction "p" is also src
+  cp = norm2(src);
+  for(int s=0;s<nshift;s++){
+    rsq[s] = cp * mresidual[s] * mresidual[s];
+    std::cout<<"ConjugateGradientMultiShift: shift "<<s
+	     <<" target resid "<<rsq[s]<<std::endl;
+    ps[s] = src;
+  }
+  // r and p for primary
+  r=src;
+  p=src;
+  
+  //MdagM+m[0]
+  Linop.HermOpAndNorm(p,mmp,d,qq);
+  axpy(mmp,mass[0],p,mmp);
+  RealD rn = norm2(p);
+  d += rn*mass[0];
+  
+  // have verified that inner product of 
+  // p and mmp is equal to d after this since
+  // the d computation is tricky
+  //  qq = real(innerProduct(p,mmp));
+  //  std::cout << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
+  
+  b = -cp /d;
+  
+  // Set up the various shift variables
+  int       iz=0;
+  z[0][1-iz] = 1.0;
+  z[0][iz]   = 1.0;
+  bs[0]      = b;
+  for(int s=1;s<nshift;s++){
+    z[s][1-iz] = 1.0;
+    z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
+    bs[s]      = b*z[s][iz]; 
+  }
+  
+  // r += b[0] A.p[0]
+  // c= norm(r)
+  c=axpy_norm(r,b,mmp,r);
+  
+  for(int s=0;s<nshift;s++) {
+    axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
+  }
+  
+  
+  // Iteration loop
+  int k;
+  
+  for (k=1;k<=MaxIterations;k++){
+    
+    a = c /cp;
+    axpy(p,a,p,r);
+    
+    // Note to self - direction ps is iterated seperately
+    // for each shift. Does not appear to have any scope
+    // for avoiding linear algebra in "single" case.
+    // 
+    // However SAME r is used. Could load "r" and update
+    // ALL ps[s]. 2/3 Bandwidth saving
+    // New Kernel: Load r, vector of coeffs, vector of pointers ps
+    for(int s=0;s<nshift;s++){
+      if ( ! converged[s] ) { 
+	if (s==0){
+	  axpy(ps[s],a,ps[s],r);
+	} else{
+	  RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
+	  axpby(ps[s],z[s][iz],as,r,ps[s]);
+	}
+      }
+    }
+    
+    cp=c;
+    
+    Linop.HermOpAndNorm(p,mmp,d,qq);
+    axpy(mmp,mass[0],p,mmp);
+    RealD rn = norm2(p);
+    d += rn*mass[0];
+    
+    bp=b;
+    b=-cp/d;
+    
+    c=axpy_norm(r,b,mmp,r);
+
+    // Toggle the recurrence history
+    bs[0] = b;
+    iz = 1-iz;
+    for(int s=1;s<nshift;s++){
+      if((!converged[s])){
+	RealD z0 = z[s][1-iz];
+	RealD z1 = z[s][iz];
+	z[s][iz] = z0*z1*bp
+	  / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
+	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
+      }
+    }
+    
+    for(int s=0;s<nshift;s++){
+      int ss = s;
+      // Scope for optimisation here in case of "single".
+      // Could load psi[0] and pull all ps[s] in.
+      //      if ( single ) ss=primary;
+      // Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
+      // Pipelined CG gain:
+      //
+      // New Kernel: Load r, vector of coeffs, vector of pointers ps
+      // New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
+      // If can predict the coefficient bs then we can fuse these and avoid write reread cyce
+      //  on ps[s].
+      // Before:  3 x npole  + 3 x npole
+      // After :  2 x npole (ps[s])        => 3x speed up of multishift CG.
+      
+      if( (!converged[s]) ) { 
+	axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
+      }
+    }
+    
+    // Convergence checks
+    int all_converged = 1;
+    for(int s=0;s<nshift;s++){
+      
+      if ( (!converged[s]) ){
+	
+	RealD css  = c * z[s][iz]* z[s][iz];
+	
+	if(css<rsq[s]){
+	  if ( ! converged[s] )
+	    std::cout<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	      converged[s]=1;
+	} else {
+	  all_converged=0;
+	}
+
+      }
+    }
+    
+    if ( all_converged ){
+
+      std::cout<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
+      std::cout<< "CGMultiShift: Checking solutions"<<std::endl;
+      
+      // Check answers 
+      for(int s=0; s < nshift; s++) { 
+	Linop.HermOpAndNorm(psi[s],mmp,d,qq);
+	axpy(tmp,mass[s],psi[s],mmp);
+	axpy(r,-alpha[s],src,tmp);
+	RealD rn = norm2(r);
+	RealD cn = norm2(src);
+	std::cout<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+      }
+      return;
+    }
+  }
+  // ugly hack
+  std::cout<<"CG multi shift did not converge"<<std::endl;
+  assert(0);
+}
+
+  };
+}
+#endif
diff --git a/tests/Make.inc b/tests/Make.inc
index d417ab56..2b132fd0 100644
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_main Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
+bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
 
 
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
@@ -70,6 +70,10 @@ Test_main_SOURCES=Test_main.cc
 Test_main_LDADD=-lGrid
 
 
+Test_multishift_sqrt_SOURCES=Test_multishift_sqrt.cc
+Test_multishift_sqrt_LDADD=-lGrid
+
+
 Test_nersc_io_SOURCES=Test_nersc_io.cc
 Test_nersc_io_LDADD=-lGrid
 
diff --git a/tests/Test_multishift_sqrt.cc b/tests/Test_multishift_sqrt.cc
new file mode 100644
index 00000000..8e6b08d4
--- /dev/null
+++ b/tests/Test_multishift_sqrt.cc
@@ -0,0 +1,110 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class Field> class DumbOperator  : public LinearOperatorBase<Field> {
+public:
+  LatticeComplex scale;
+  LatticeComplex sqrtscale;
+  DumbOperator(GridBase *grid)
+    : scale(grid),
+      sqrtscale(grid)
+  {
+    GridParallelRNG  pRNG(grid);  
+    std::vector<int> seeds({5,6,7,8});
+    pRNG.SeedFixedIntegers(seeds);
+
+    random(pRNG,sqrtscale);
+    sqrtscale = sqrtscale * adj(sqrtscale);// force real pos def
+    scale = sqrtscale * sqrtscale; 
+  }
+  void Op     (const Field &in, Field &out){
+    out = scale * in;
+  }
+  void AdjOp  (const Field &in, Field &out){
+    out = scale * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+    ComplexD dot;
+
+    out = scale * in;
+
+    dot= innerProduct(in,out);
+    n1=real(dot);
+
+    dot = innerProduct(out,out);
+    n2=real(dot);
+  }
+  void ApplySqrt(const Field &in, Field &out){
+    out = sqrtscale * in;
+  }
+};
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  GridCartesian *grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+						       GridDefaultSimd(Nd,vComplexF::Nsimd()),
+						       GridDefaultMpi());
+
+  double     lo=0.001;
+  double     hi=1.0;
+  int precision=64;
+  int    degree=10;
+  AlgRemez remez(lo,hi,precision);
+
+  ////////////////////////////////////////
+  // sqrt and inverse sqrt
+  ////////////////////////////////////////
+  std::cout << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
+  remez.generateApprox(degree,1,2);
+
+  MultiShiftFunction Sqrt(remez,1.0e-6,false);
+
+  GridParallelRNG  pRNG(grid);  
+  std::vector<int> seeds({1,2,3,4});
+  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeFermion    src(grid); random(pRNG,src);
+  LatticeFermion    combined(grid);
+  LatticeFermion    reference(grid);
+  LatticeFermion    error(grid);
+  std::vector<LatticeFermion> result(degree,grid);
+  LatticeFermion    summed(grid);
+
+  ConjugateGradientMultiShift<LatticeFermion> MSCG(10000,Sqrt);
+
+  DumbOperator<LatticeFermion> Diagonal(grid);
+
+  MSCG(Diagonal,src,result);
+
+
+  //  double a = norm;
+  //  for(int n=0;n<poles.size();n++){
+  //    a = a + residues[n]/(x+poles[n]);
+  //  }
+  assert(Sqrt.order==degree);
+
+  combined = Sqrt.norm*src;
+  for(int i=0;i<degree;i++){
+    combined = combined + Sqrt.residues[i]*result[i];
+  }
+  
+  Diagonal.ApplySqrt(src,reference);
+
+  error = reference - combined;
+
+  std::cout << " Reference "<<norm2(reference)<<std::endl;
+  std::cout << " combined  "<<norm2(combined) <<std::endl;
+  std::cout << " error     "<<norm2(error)    <<std::endl;
+
+  MSCG(Diagonal,src,summed);
+  error = summed - combined;
+  std::cout << " summed-combined "<<norm2(error)    <<std::endl;
+
+  Grid_finalize();
+}
diff --git a/tests/Test_remez.cc b/tests/Test_remez.cc
index 0d65ac7b..5a6300e7 100644
--- a/tests/Test_remez.cc
+++ b/tests/Test_remez.cc
@@ -4,43 +4,6 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-class MultiShiftFunction {
-public:
-  std::vector<double> poles;
-  std::vector<double> residues;
-  double norm;
-  double lo,hi;
-  MultiShiftFunction(int n,double _lo,double _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
-  double approx(double x);
-  void csv(std::ostream &out);
-  void gnuplot(std::ostream &out);
-};
-double MultiShiftFunction::approx(double x)
-{
-  double a = norm;
-  for(int n=0;n<poles.size();n++){
-    a = a + residues[n]/(x+poles[n]);
-  }
-  return a;
-}
-void MultiShiftFunction::gnuplot(std::ostream &out)
-{
-  out<<"f(x) = "<<norm<<"";
-  for(int n=0;n<poles.size();n++){
-    out<<"+("<<residues[n]<<"/(x+"<<poles[n]<<"))";
-  }
-  out<<";"<<std::endl;
-}
-void MultiShiftFunction::csv(std::ostream &out)
-{
-  for (double x=lo; x<hi; x*=1.05) {
-    double f = approx(x);
-    double r = std::sqrt(x);
-    out<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
-  }
-  return;
-}
-
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -56,22 +19,18 @@ int main (int argc, char ** argv)
   ////////////////////////////////////////
   // sqrt and inverse sqrt
   ////////////////////////////////////////
-  MultiShiftFunction Sqrt(degree,lo,hi);
-  MultiShiftFunction InvSqrt(degree,lo,hi);
-
-  MultiShiftFunction SqrtSqrt(degree,lo,hi);
-  MultiShiftFunction InvSqrtSqrt(degree,lo,hi);
-
 
   std::cout << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
   remez.generateApprox(degree,1,2);
-  remez.getPFE (&   Sqrt.residues[0],&   Sqrt.poles[0],&   Sqrt.norm);
-  remez.getIPFE(&InvSqrt.residues[0],&InvSqrt.poles[0],&InvSqrt.norm);
+  MultiShiftFunction Sqrt(remez,1.0,false);
+  MultiShiftFunction InvSqrt(remez,1.0,true);
+
 
   std::cout << "Generating degree "<<degree<<" for x^(1/4)"<<std::endl;
   remez.generateApprox(degree,1,4);
-  remez.getPFE (&SqrtSqrt.residues[0],&SqrtSqrt.poles[0],&SqrtSqrt.norm);
-  remez.getIPFE(&InvSqrtSqrt.residues[0],&InvSqrtSqrt.poles[0],&InvSqrtSqrt.norm);
+  MultiShiftFunction SqrtSqrt(remez,1.0,false);
+  MultiShiftFunction InvSqrtSqrt(remez,1.0,true);
+
   
   ofstream gnuplot(std::string("Sqrt.gnu"),std::ios::out|std::ios::trunc);
   Sqrt.gnuplot(gnuplot);
diff --git a/tests/Test_wilson_cg_unprec.cc b/tests/Test_wilson_cg_unprec.cc
index c9f2856b..13996017 100644
--- a/tests/Test_wilson_cg_unprec.cc
+++ b/tests/Test_wilson_cg_unprec.cc
@@ -20,7 +20,6 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
-
   std::vector<int> latt_size   = GridDefaultLatt();
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();