Updating preparing for solvers etc..

2025-11-22 23:49:31 +00:00 · 2015-05-16 23:35:08 +01:00
parent cf99a1f37d
commit e841395dfd
8 changed files with 1170 additions and 34 deletions
--- a/benchmarks/Grid_memory_bandwidth.cc
+++ b/benchmarks/Grid_memory_bandwidth.cc
@@ -19,7 +19,7 @@ int main (int argc, char ** argv)
  std::cout << "===================================================================================================="<<std::endl;
  std::cout << "= Benchmarking fused AXPY bandwidth"<<std::endl;
  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"GB/s"<<std::endl;
+  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
  std::cout << "----------------------------------------------------------"<<std::endl;

  for(int lat=4;lat<=32;lat+=4){
@@ -44,15 +44,16 @@ int main (int argc, char ** argv)
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
      
+      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
      double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(2) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
+      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;

    }

  std::cout << "===================================================================================================="<<std::endl;
  std::cout << "= Benchmarking a*x + y bandwidth"<<std::endl;
  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"GB/s"<<std::endl;
+  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
  std::cout << "----------------------------------------------------------"<<std::endl;
  
  for(int lat=4;lat<=32;lat+=4){
@@ -75,15 +76,16 @@ int main (int argc, char ** argv)
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
     
+      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
      double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(2) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
+      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;

    }

  std::cout << "===================================================================================================="<<std::endl;
  std::cout << "= Benchmarking SCALE bandwidth"<<std::endl;
  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"GB/s"<<std::endl;
+  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;

  for(int lat=4;lat<=32;lat+=4){

@@ -107,14 +109,15 @@ int main (int argc, char ** argv)
      double time = (stop-start)/Nloop*1000;
      
      double bytes=2*lat*lat*lat*lat*Nvec*sizeof(Real);
-      std::cout <<std::setprecision(2) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
+      double flops=lat*lat*lat*lat*Nvec*1;// mul
+      std::cout <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;

  }

  std::cout << "===================================================================================================="<<std::endl;
  std::cout << "= Benchmarking READ bandwidth"<<std::endl;
  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"GB/s"<<std::endl;
+  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<std::endl;
  std::cout << "----------------------------------------------------------"<<std::endl;

  for(int lat=4;lat<=32;lat+=4){
@@ -139,7 +142,8 @@ int main (int argc, char ** argv)
      double time = (stop-start)/Nloop*1000;
      
      double bytes=lat*lat*lat*lat*Nvec*sizeof(Real);
-      std::cout<<std::setprecision(2) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
+      double flops=lat*lat*lat*lat*Nvec*2;// mul,add
+      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<<std::endl;

  }    

--- a/benchmarks/Grid_su3.cc
+++ b/benchmarks/Grid_su3.cc
@@ -16,7 +16,7 @@ int main (int argc, char ** argv)
  std::cout << "===================================================================================================="<<std::endl;
  std::cout << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout << "----------------------------------------------------------"<<std::endl;

  for(int lat=2;lat<=24;lat+=2){
@@ -40,7 +40,7 @@ int main (int argc, char ** argv)
      double bytes=3.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
      double footprint=2.0*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6.0+8.0+8.0)*lat*lat*lat*lat;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"\t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<std::setprecision(3) << lat<<"\t\t"<<footprint<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

@@ -48,7 +48,7 @@ int main (int argc, char ** argv)
  std::cout << "===================================================================================================="<<std::endl;
  std::cout << "= Benchmarking SU3xSU3  z= x*y"<<std::endl;
  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout << "----------------------------------------------------------"<<std::endl;

  for(int lat=2;lat<=24;lat+=2){
@@ -71,14 +71,14 @@ int main (int argc, char ** argv)
      
      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

  std::cout << "===================================================================================================="<<std::endl;
  std::cout << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl;
  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout << "----------------------------------------------------------"<<std::endl;

  for(int lat=2;lat<=24;lat+=2){
@@ -101,14 +101,14 @@ int main (int argc, char ** argv)
      
      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(6+8+8)*lat*lat*lat*lat;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

  std::cout << "===================================================================================================="<<std::endl;
  std::cout << "= Benchmarking SU3xSU3  mac(z,x,y)"<<std::endl;
  std::cout << "===================================================================================================="<<std::endl;
-  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout << "----------------------------------------------------------"<<std::endl;

  for(int lat=2;lat<=24;lat+=2){
@@ -131,7 +131,7 @@ int main (int argc, char ** argv)
      
      double bytes=3*lat*lat*lat*lat*Nc*Nc*sizeof(Complex);
      double flops=Nc*Nc*(8+8+8)*lat*lat*lat*lat;
-      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      std::cout<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;

    }

--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -5,14 +5,211 @@

 namespace Grid {

-  // Red black cases?
-    template<class vtype> class LinearOperatorBase {
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // LinearOperators Take a something and return a something.
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  //
+  // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
+  //
+  //   i)  F(a x + b y) = aF(x) + b F(y).
+  //  ii)  <x|Op|y> = <y|AdjOp|x>^\ast
+  //
+  // Would be fun to have a test linearity & Herm Conj function!
+  /////////////////////////////////////////////////////////////////////////////////////////////
+    template<class Field> class LinearOperatorBase {
    public:
-      void M(const Lattice<vtype> &in, Lattice<vtype> &out){ assert(0);}
-      void Mdag(const Lattice<vtype> &in, Lattice<vtype> &out){ assert(0);}
-      void MdagM(const Lattice<vtype> &in, Lattice<vtype> &out){ assert(0);}
+      virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
+      virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
    };

+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // Hermitian operators are self adjoint and only require Op to be defined, so refine the base
+  /////////////////////////////////////////////////////////////////////////////////////////////
+    template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
+    public:
+      void AdjOp(const Field &in, Field &out) {
+	return Op(in,out);
+      };
+    };
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // Interface defining what I expect of a general sparse matrix
+  /////////////////////////////////////////////////////////////////////////////////////////////
+    template<class Field> class SparseMatrixBase {
+    public:
+      // Full checkerboar operations
+      virtual void M    (const Field &in, Field &out);
+      virtual void Mdag (const Field &in, Field &out);
+      virtual void MdagM(const Field &in, Field &out);
+      
+      // half checkerboard operaions
+      virtual void Mpc      (const Field &in, Field &out);
+      virtual void MpcDag   (const Field &in, Field &out);
+      virtual void MpcDagMpc(const Field &in, Field &out);
+    };
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // Whereas non hermitian takes a generic sparse matrix (e.g. lattice action)
+  // conforming to sparse matrix interface and builds the full checkerboard non-herm operator
+  // Op and AdjOp distinct.
+  // By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
+  // between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
+  // the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
+  // replication of code.
+  /////////////////////////////////////////////////////////////////////////////////////////////
+    template<class SparseMatrix,class Field>
+    class NonHermitianOperator : public LinearOperatorBase<Field> {
+      SparseMatrix &_Mat;
+    public:
+      NonHermitianOperator(SparseMatrix &Mat): _Mat(Mat){};
+      void Op     (const Field &in, Field &out){
+	_Mat.M(in,out);
+      }
+      void AdjOp     (const Field &in, Field &out){
+	_Mat.Mdag(in,out);
+      }
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Redblack Non hermitian wrapper
+    ////////////////////////////////////////////////////////////////////////////////////
+    template<class SparseMatrix,class Field>
+    class NonHermitianRedBlackOperator : public LinearOperatorBase<Field> {
+      SparseMatrix &_Mat;
+    public:
+      NonHermitianRedBlackOperator(SparseMatrix &Mat): _Mat(Mat){};
+      void Op     (const Field &in, Field &out){
+	this->Mpc(in,out);
+      }
+      void AdjOp     (const Field &in, Field &out){ //
+	this->MpcDag(in,out);
+      }
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Hermitian wrapper
+    ////////////////////////////////////////////////////////////////////////////////////
+    template<class SparseMatrix,class Field>
+    class HermitianOperator : public HermitianOperatorBase<Field> {
+      SparseMatrix &_Mat;
+    public:
+      HermitianOperator(SparseMatrix &Mat): _Mat(Mat) {};
+
+      void Op     (const Field &in, Field &out){
+	this->Mpc(in,out);
+      }
+      void AdjOp     (const Field &in, Field &out){ //
+	this->MpcDag(in,out);
+      }
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Hermitian RedBlack wrapper
+    ////////////////////////////////////////////////////////////////////////////////////
+    template<class SparseMatrix,class Field>
+    class HermitianRedBlackOperator : public HermitianOperatorBase<Field> {
+      SparseMatrix &_Mat;
+    public:
+      HermitianRedBlackOperator(SparseMatrix &Mat): _Mat(Mat) {};
+      void Op     (const Field &in, Field &out){
+	this->MpcDagMpc(in,out);
+      }
+    };
+
+
+    /////////////////////////////////////////////////////////////
+    // Base classes for functions of operators
+    /////////////////////////////////////////////////////////////
+    template<class Field> class OperatorFunction {
+    public:
+      virtual void operator() (LinearOperatorBase<Field> *Linop, const Field &in, Field &out) = 0;
+    };
+
+    /////////////////////////////////////////////////////////////
+    // Base classes for polynomial functions of operators ? needed?
+    /////////////////////////////////////////////////////////////
+    template<class Field> class OperatorPolynomial : public OperatorFunction<Field> {
+    public:
+      virtual void operator() (LinearOperatorBase<Field> *Linop,const Field &in, Field &out) = 0;
+    };
+
+    /////////////////////////////////////////////////////////////
+    // Base classes for iterative processes based on operators
+    // single input vec, single output vec.
+    /////////////////////////////////////////////////////////////
+    template<class Field> class IterativeProcess : public OperatorFunction<Field> {
+    public:
+      RealD   Tolerance;
+      Integer MaxIterations;
+      virtual void operator() (LinearOperatorBase<Field> *Linop,const Field &in, Field &out) = 0;
+    };
+
+    /////////////////////////////////////////////////////////////
+    // Grand daddy iterative method
+    /////////////////////////////////////////////////////////////
+    template<class Field> class ConjugateGradient : public IterativeProcess<Field> {
+    public:
+      virtual void operator() (HermitianOperatorBase<Field> *Linop,const Field &in, Field &out) = 0;
+    };
+
+    /////////////////////////////////////////////////////////////
+    // A little more modern
+    /////////////////////////////////////////////////////////////
+    template<class Field> class PreconditionedConjugateGradient : public  IterativeProcess<Field>  {
+    public:
+      void operator() (HermitianOperatorBase<Field> *Linop,
+		       OperatorFunction<Field> *Preconditioner,
+		       const Field &in, 
+		       Field &out) = 0;
+    };
+
+    // FIXME : To think about
+
+    // Chroma functionality list defining LinearOperator
+    /*
+     virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
+     virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
+     virtual const Subset& subset() const = 0;
+     virtual unsigned long nFlops() const { return 0; }
+     virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
+     class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
+       const Subset& subset() const {return all;}
+     };
+    */
+
+    // Chroma interface defining GaugeAction
+    /*
+      template<typename P, typename Q>   class GaugeAction
+  virtual const CreateGaugeState<P,Q>& getCreateState() const = 0;
+  virtual GaugeState<P,Q>* createState(const Q& q) const
+  virtual const GaugeBC<P,Q>& getGaugeBC() const
+  virtual const Set& getSet(void) const = 0;
+  virtual void deriv(P& result, const Handle< GaugeState<P,Q> >& state) const 
+  virtual Double S(const Handle< GaugeState<P,Q> >& state) const = 0;
+
+  class LinearGaugeAction : public GaugeAction< multi1d<LatticeColorMatrix>, multi1d<LatticeColorMatrix> >
+  typedef multi1d<LatticeColorMatrix>  P;
+  typedef multi1d<LatticeColorMatrix>  Q;
+  virtual void staple(LatticeColorMatrix& result,
+		      const Handle< GaugeState<P,Q> >& state,
+		      int mu, int cb) const = 0;
+    */
+
+    // Chroma interface defining FermionAction
+    /*
+     template<typename T, typename P, typename Q>  class FermAct4D : public FermionAction<T,P,Q>
+     virtual LinearOperator<T>* linOp(Handle< FermState<T,P,Q> > state) const = 0;
+     virtual LinearOperator<T>* lMdagM(Handle< FermState<T,P,Q> > state) const = 0;
+     virtual LinOpSystemSolver<T>* invLinOp(Handle< FermState<T,P,Q> > state,
+     virtual MdagMSystemSolver<T>* invMdagM(Handle< FermState<T,P,Q> > state,
+     virtual LinOpMultiSystemSolver<T>* mInvLinOp(Handle< FermState<T,P,Q> > state,
+     virtual MdagMMultiSystemSolver<T>* mInvMdagM(Handle< FermState<T,P,Q> > state,
+     virtual MdagMMultiSystemSolverAccumulate<T>* mInvMdagMAcc(Handle< FermState<T,P,Q> > state,
+     virtual SystemSolver<T>* qprop(Handle< FermState<T,P,Q> > state,
+     class DiffFermAct4D : public FermAct4D<T,P,Q>
+     virtual DiffLinearOperator<T,Q,P>* linOp(Handle< FermState<T,P,Q> > state) const = 0;
+     virtual DiffLinearOperator<T,Q,P>* lMdagM(Handle< FermState<T,P,Q> > state) const = 0;
+    */
 }

 #endif
--- a/lib/algorithms/approx/PolynomialApprox.h
+++ b/lib/algorithms/approx/PolynomialApprox.h
@@ -0,0 +1,145 @@
+#ifndef GRID_POLYNOMIAL_APPROX_H
+#define GRID_POLYNOMIAL_APPROX_H
+
+namespace Grid {
+
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Simple general polynomial with user supplied coefficients
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field>
+  class Polynomial : public OperatorFunction<Field> {
+  private:
+    std::vector<double> _oeffs;
+  public:
+    Polynomial(std::vector<double> &_Coeffs) : Coeffs(_Coeffs) {};
+
+    // Implement the required interface
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+      Field AtoN = in;
+      out = AtoN*Coeffs[0];
+
+      for(int n=1;n<Coeffs.size();n++){
+	Field Mtmp=AtoN;
+	Linop.Op(Mtmp,AtoN);
+	out=out+AtoN*Coeffs[n];
+      }
+    };
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Generic Chebyshev approximations
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field>
+  class Chebyshev : public OperatorFunction<Field> {
+  private:
+    std::vector<double> Coeffs;
+    int order;
+    double hi;
+    double lo;
+
+  public:
+    Chebyshev(double _lo,double _hi,int _order, double (* func)(double) ){
+      lo=_lo;
+      hi=_hi;
+      order=_order;
+      
+      if(order < 2) exit(-1);
+      Coeffs.resize(order);
+      for(int j=0;j<order;j++){
+	double s=0;
+	for(int k=0;k<order;k++){
+	  double y=cos(M_PI*(k+0.5)/order);
+	  double x=0.5*(y*(hi-lo)+(hi+lo));
+	  double f=func(x);
+	  s=s+f*cos( j*M_PI*(k+0.5)/order );
+	}
+	Coeffs[j] = s * 2.0/order;
+      }
+    };
+
+    double Evaluate(double x) // Convenience for plotting the approximation
+    {
+      double Tn;
+      double Tnm;
+      double Tnp;
+      
+      double y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+      
+      double T0=1;
+      double T1=y;
+      
+      double sum;
+      sum = 0.5*Coeffs[0]*T0;
+      sum+= Coeffs[1]*T1;
+      
+      Tn =T1;
+      Tnm=T0;
+      for(int i=2;i<order;i++){
+	Tnp=2*y*Tn-Tnm;
+	Tnm=Tn;
+	Tn =Tnp;
+	sum+= Tn*Coeffs[i];
+      }
+      return sum;
+    };
+
+    // Convenience for plotting the approximation
+    void   PlotApprox(std::ostream &out) {
+      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
+      for(double x=lo;x<hi;x+=(hi-lo)/50.0){
+	out <<x<<"\t"<<Evaluate(x)<<std::endl;
+      }
+    };
+
+    // Implement the required interface; could require Lattice base class
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+      Field T0 = in;
+      Field T1 = T0; // Field T1(T0._grid); more efficient but hardwires Lattice class
+      Field T2 = T1;
+      
+      // use a pointer trick to eliminate copies
+      Field *Tnm = &T0;
+      Field *Tn  = &T1;
+      Field *Tnp = &T2;
+      Field y   = in;
+  
+      double xscale = 2.0/(hi-lo);
+      double mscale = -(hi+lo)/(hi-lo);
+
+      Field *T0=Tnm;
+      Field *T1=Tn;
+      
+
+      // Tn=T1 = (xscale M + mscale)in
+      Linop.Op(T0,y);
+
+      T1=y*xscale+in*mscale;
+
+      // sum = .5 c[0] T0 + c[1] T1
+      out = (0.5*coeffs[0])*T0 + coeffs[1]*T1;
+
+      for(int n=2;n<order;n++){
+	
+	Linop.Op(*Tn,y);
+
+	y=xscale*y+mscale*(*Tn);
+
+	*Tnp=2.0*y-(*Tnm);
+	
+	out=out+coeffs[n]* (*Tnp);
+
+	// Cycle pointers to avoid copies
+	Field *swizzle = Tnm;
+	Tnm    =Tn;
+	Tn     =Tnp;
+	Tnp    =swizzle;
+	  
+      }
+    }
+  };
+
+
+}
+#endif
--- a/lib/algorithms/approx/zolotarev.c
+++ b/lib/algorithms/approx/zolotarev.c
@@ -0,0 +1,710 @@
+/* -*- Mode: C; comment-column: 22; fill-column: 79; compile-command: "gcc -o zolotarev zolotarev.c -ansi -pedantic -lm -DTEST"; -*- */
+#define VERSION Source Time-stamp: <19-OCT-2004 09:33:22.00 adk@MISSCONTRARY>
+
+/* These C routines evalute the optimal rational approximation to the signum
+ * function for epsilon < |x| < 1 using Zolotarev's theorem.
+ *
+ * To obtain reliable results for high degree approximations (large n) it is
+ * necessary to compute using sufficiently high precision arithmetic. To this
+ * end the code has been parameterised to work with the preprocessor names
+ * INTERNAL_PRECISION and PRECISION set to float, double, or long double as
+ * appropriate. INTERNAL_PRECISION is used in computing the Zolotarev
+ * coefficients, which are converted to PRECISION before being returned to the
+ * caller. Presumably even higher precision could be obtained using GMP or
+ * similar package, but bear in mind that rounding errors might also be
+ * significant in evaluating the resulting polynomial. The convergence criteria
+ * have been written in a precision-independent form. */
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+
+#ifndef INTERNAL_PRECISION
+#define INTERNAL_PRECISION double
+#endif
+
+#include "zolotarev.h"
+#define ZOLOTAREV_INTERNAL
+#undef ZOLOTAREV_DATA
+#define ZOLOTAREV_DATA izd
+#undef ZPRECISION
+#define ZPRECISION INTERNAL_PRECISION
+#include "zolotarev.h"
+#undef ZOLOTAREV_INTERNAL
+
+/* The ANSI standard appears not to know what pi is */
+
+#ifndef M_PI
+#define M_PI ((INTERNAL_PRECISION) 3.141592653589793238462643383279502884197\
+169399375105820974944592307816406286208998628034825342117068)
+#endif
+
+#define ZERO ((INTERNAL_PRECISION) 0)
+#define ONE ((INTERNAL_PRECISION) 1)
+#define TWO ((INTERNAL_PRECISION) 2)
+#define THREE ((INTERNAL_PRECISION) 3)
+#define FOUR ((INTERNAL_PRECISION) 4)
+#define HALF (ONE/TWO)
+
+/* The following obscenity seems to be the simplest (?) way to coerce the C
+ * preprocessor to convert the value of a preprocessor token into a string. */
+
+#define PP2(x) #x
+#define PP1(a,b,c) a ## b(c)
+#define STRINGIFY(name) PP1(PP,2,name)
+
+/* Compute the partial fraction expansion coefficients (alpha) from the
+ * factored form */
+
+static void construct_partfrac(izd *z) {
+  int dn = z -> dn, dd = z -> dd, type = z -> type;
+  int j, k, da = dd + 1 + type;
+  INTERNAL_PRECISION A = z -> A, *a = z -> a, *ap = z -> ap, *alpha;
+  alpha = (INTERNAL_PRECISION*) malloc(da * sizeof(INTERNAL_PRECISION));
+  for (j = 0; j < dd; j++)
+    for (k = 0, alpha[j] = A; k < dd; k++)
+      alpha[j] *=
+	(k < dn ? ap[j] - a[k] : ONE) / (k == j ? ONE : ap[j] - ap[k]);
+  if(type == 1)	      /* implicit pole at zero? */
+    for (k = 0, alpha[dd] = A * (dn > dd ? - a[dd] : ONE); k < dd; k++) {
+      alpha[dd] *= a[k] / ap[k];
+      alpha[k] *= (dn > dd ? ap[k] - a[dd] : ONE) / ap[k];
+    }
+  alpha[da-1] = dn == da - 1 ? A : ZERO;
+  z -> alpha = alpha;
+  z -> da = da;
+  return;
+}
+
+/* Convert factored polynomial into dense polynomial. The input is the overall
+ * factor A and the roots a[i], such that p = A product(x - a[i], i = 1..d) */
+
+static INTERNAL_PRECISION *poly_factored_to_dense(INTERNAL_PRECISION A, 
+						  INTERNAL_PRECISION *a,
+						  int d) {
+  INTERNAL_PRECISION *p;
+  int i, j;
+  p = (INTERNAL_PRECISION *) malloc((d + 2) * sizeof(INTERNAL_PRECISION));
+  p[0] = A;
+  for (i = 0; i < d; i++) {
+    p[i+1] = p[i];
+    for (j = i; j > 0; j--) p[j] = p[j-1] - a[i]*p[j];
+    p[0] *= - a[i];
+  }
+  return p;
+}
+
+/* Convert a rational function of the form R0(x) = x p(x^2)/q(x^2) (type 0) or
+ * R1(x) = p(x^2)/[x q(x^2)] (type 1) into its continued fraction
+ * representation. We assume that 0 <= deg(q) - deg(p) <= 1 for type 0 and 0 <=
+ * deg(p) - deg(q) <= 1 for type 1. On input p and q are in factored form, and
+ * deg(q) = dq, deg(p) = dp.  The output is the continued fraction coefficients
+ * beta, where R(x) = beta[0] x + 1/(beta[1] x + 1/(...)).
+ *
+ * The method used is as follows. There are four cases to consider:
+ *
+ * 0.i.  Type 0, deg p = deg q
+ *
+ * 0.ii. Type 0, deg p = deg q - 1
+ *
+ * 1.i.  Type 1, deg p = deg q
+ *
+ * 1.ii. Type 1, deg p = deg q + 1
+ *
+ * and these are connected by two transformations:
+ *
+ * A. To obtain a continued fraction expansion of type 1 we use a single-step
+ * polynomial division we find beta and r(x) such that p(x) = beta x q(x) +
+ * r(x), with deg(r) = deg(q). This implies that p(x^2) = beta x^2 q(x^2) +
+ * r(x^2), and thus R1(x) = x beta + r(x^2)/(x q(x^2)) = x beta + 1/R0(x)
+ * with R0(x) = x q(x^2)/r(x^2).
+ *
+ * B. A continued fraction expansion of type 0 is obtained in a similar, but
+ * not identical, manner. We use the polynomial division algorithm to compute
+ * the quotient beta and the remainder r that satisfy p(x) = beta q(x) + r(x)
+ * with deg(r) = deg(q) - 1. We thus have x p(x^2) = x beta q(x^2) + x r(x^2),
+ * so R0(x) = x beta + x r(x^2)/q(x^2) = x beta + 1/R1(x) with R1(x) = q(x^2) /
+ * (x r(x^2)).
+ *
+ * Note that the deg(r) must be exactly deg(q) for (A) and deg(q) - 1 for (B)
+ * because p and q have disjoint roots all of multiplicity 1. This means that
+ * the division algorithm requires only a single polynomial subtraction step.
+ *
+ * The transformations between the cases form the following finite state
+ * automaton:
+ *
+ * +------+            +------+            +------+            +------+
+ * |      |            |      | ---(A)---> |      |            |      |
+ * | 0.ii | ---(B)---> | 1.ii |            | 0.i  | <---(A)--- | 1.i  |
+ * |      |            |      | <---(B)--- |      |            |      |
+ * +------+            +------+            +------+            +------+
+ */
+
+static INTERNAL_PRECISION *contfrac_A(INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *, int, int);
+
+static INTERNAL_PRECISION *contfrac_B(INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *,
+				      INTERNAL_PRECISION *, int, int);
+
+static void construct_contfrac(izd *z){
+  INTERNAL_PRECISION *r, A = z -> A, *p = z -> a, *q = z -> ap;
+  int dp = z -> dn, dq = z -> dd, type = z -> type;
+
+  z -> db = 2 * dq + 1 + type;
+  z -> beta = (INTERNAL_PRECISION *)
+    malloc(z -> db * sizeof(INTERNAL_PRECISION));
+  p = poly_factored_to_dense(A, p, dp);
+  q = poly_factored_to_dense(ONE, q, dq);
+  r = (INTERNAL_PRECISION *) malloc((MAX(dp,dq) + 1) *
+				    sizeof(INTERNAL_PRECISION));
+  if (type == 0) (void) contfrac_B(z -> beta, p, q, r, dp, dq);
+  else (void) contfrac_A(z -> beta, p, q, r, dp, dq);
+  free(p); free(q); free(r);
+  return;
+}
+
+static INTERNAL_PRECISION *contfrac_A(INTERNAL_PRECISION *beta,
+				      INTERNAL_PRECISION *p,
+				      INTERNAL_PRECISION *q,
+				      INTERNAL_PRECISION *r, int dp, int dq) {
+  INTERNAL_PRECISION quot, *rb;
+  int j;
+
+  /* p(x) = x beta q(x) + r(x); dp = dq or dp = dq + 1 */
+
+  quot = dp == dq ? ZERO : p[dp] / q[dq];
+  r[0] = p[0];
+  for (j = 1; j <= dp; j++) r[j] = p[j] - quot * q[j-1];
+#ifdef DEBUG
+  printf("%s: Continued Fraction form: deg p = %2d, deg q = %2d, beta = %g\n",
+	 __FUNCTION__, dp, dq, (float) quot);
+  for (j = 0; j <= dq + 1; j++)
+    printf("\tp[%2d] = %14.6g\tq[%2d] = %14.6g\tr[%2d] = %14.6g\n",
+	   j, (float) (j > dp ? ZERO : p[j]),
+	   j, (float) (j == 0 ? ZERO : q[j-1]),
+	   j, (float) (j == dp ? ZERO : r[j]));
+#endif /* DEBUG */
+  *(rb = contfrac_B(beta, q, r, p, dq, dq)) = quot;
+  return rb + 1;
+}
+
+static INTERNAL_PRECISION *contfrac_B(INTERNAL_PRECISION *beta,
+				      INTERNAL_PRECISION *p,
+				      INTERNAL_PRECISION *q,
+				      INTERNAL_PRECISION *r, int dp, int dq) {
+  INTERNAL_PRECISION quot, *rb;
+  int j;
+
+  /* p(x) = beta q(x) + r(x); dp = dq or dp = dq - 1 */
+
+  quot = dp == dq ? p[dp] / q[dq] : ZERO;
+  for (j = 0; j < dq; j++) r[j] = p[j] - quot * q[j];
+#ifdef DEBUG
+  printf("%s: Continued Fraction form: deg p = %2d, deg q = %2d, beta = %g\n",
+	 __FUNCTION__, dp, dq, (float) quot);
+  for (j = 0; j <= dq; j++)
+    printf("\tp[%2d] = %14.6g\tq[%2d] = %14.6g\tr[%2d] = %14.6g\n",
+	   j, (float) (j > dp ? ZERO : p[j]),
+	   j, (float) q[j],
+	   j, (float) (j == dq ? ZERO : r[j]));
+#endif /* DEBUG */
+  *(rb = dq > 0 ? contfrac_A(beta, q, r, p, dq, dq-1) : beta) = quot;
+  return rb + 1;
+}
+
+/* The global variable U is used to hold the argument u throughout the AGM
+ * recursion. The global variables F and K are set in the innermost
+ * instantiation of the recursive function AGM to the values of the elliptic
+ * integrals F(u,k) and K(k) respectively. They must be made thread local to
+ * make this code thread-safe in a multithreaded environment. */
+
+static INTERNAL_PRECISION U, F, K;	/* THREAD LOCAL */
+
+/* Recursive implementation of Gauss' arithmetico-geometric mean, which is the
+ * kernel of the method used to compute the Jacobian elliptic functions
+ * sn(u,k), cn(u,k), and dn(u,k) with parameter k (where 0 < k < 1), as well
+ * as the elliptic integral F(s,k) satisfying F(sn(u,k)) = u and the complete
+ * elliptic integral K(k).
+ *
+ * The algorithm used is a recursive implementation of the Gauss (Landen)
+ * transformation.
+ *
+ * The function returns the value of sn(u,k'), where k' is the dual parameter,
+ * and also sets the values of the global variables F and K.  The latter is
+ * used to determine the sign of cn(u,k').
+ *
+ * The algorithm is deemed to have converged when b ceases to increase. This
+ * works whatever INTERNAL_PRECISION is specified. */
+
+static INTERNAL_PRECISION AGM(INTERNAL_PRECISION a,
+			      INTERNAL_PRECISION b,
+			      INTERNAL_PRECISION s) {
+  static INTERNAL_PRECISION pb = -ONE;
+  INTERNAL_PRECISION c, d, xi;
+
+  if (b <= pb) {
+    pb = -ONE;
+    F = asin(s) / a;		/* Here, a is the AGM */
+    K = M_PI / (TWO * a);
+    return sin(U * a);
+  }
+  pb = b;
+  c = a - b;
+  d = a + b;
+  xi = AGM(HALF*d, sqrt(a*b), ONE + c*c == ONE ?
+	   HALF*s*d/a : (a - sqrt(a*a - s*s*c*d))/(c*s));
+  return 2*a*xi / (d + c*xi*xi);
+}
+
+/* Computes sn(u,k), cn(u,k), dn(u,k), F(u,k), and K(k). It is essentially a
+ * wrapper for the routine AGM. The sign of cn(u,k) is defined to be -1 if
+ * K(k) < u < 3*K(k) and +1 otherwise, and thus sign is computed by some quite
+ * unnecessarily obfuscated bit manipulations. */
+
+static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
+		     INTERNAL_PRECISION* sn, INTERNAL_PRECISION* cn,
+		     INTERNAL_PRECISION* dn, INTERNAL_PRECISION* elF,
+		     INTERNAL_PRECISION* elK) {
+  int sgn;
+  U = u;
+  *sn = AGM(ONE, sqrt(ONE - k*k), u);
+  sgn = ((int) (fabs(u) / K)) % 4; /* sgn = 0, 1, 2, 3 */
+  sgn ^= sgn >> 1;    /* (sgn & 1) = 0, 1, 1, 0 */
+  sgn = 1 - ((sgn & 1) << 1);	/* sgn = 1, -1, -1, 1 */
+  *cn = ((INTERNAL_PRECISION) sgn) * sqrt(ONE - *sn * *sn);
+  *dn = sqrt(ONE - k*k* *sn * *sn);
+  *elF = F;
+  *elK = K;
+}
+
+/* Compute the coefficients for the optimal rational approximation R(x) to
+ * sgn(x) of degree n over the interval epsilon < |x| < 1 using Zolotarev's
+ * formula. 
+ *
+ * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
+ * type = 1 for the approximation which is infinite at x = 0. */
+
+zolotarev_data* bfm_zolotarev(PRECISION epsilon, int n, int type) {
+  INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
+    l, invlambda, xi, xisq, *tv, s, opl;
+  int m, czero, ts;
+  zolotarev_data *zd;
+  izd *d = (izd*) malloc(sizeof(izd));
+
+  d -> type = type;
+  d -> epsilon = (INTERNAL_PRECISION) epsilon;
+  d -> n = n;
+  d -> dd = n / 2;
+  d -> dn = d -> dd - 1 + n % 2; /* n even: dn = dd - 1, n odd: dn = dd */
+  d -> deg_denom = 2 * d -> dd;
+  d -> deg_num = 2 * d -> dn + 1;
+
+  d -> a = (INTERNAL_PRECISION*) malloc((1 + d -> dn) *
+					sizeof(INTERNAL_PRECISION));
+  d -> ap = (INTERNAL_PRECISION*) malloc(d -> dd *
+					 sizeof(INTERNAL_PRECISION));
+  ksq = d -> epsilon * d -> epsilon;
+  kp = sqrt(ONE - ksq);
+  sncndnFK(ZERO, kp, &sn, &cn, &dn, &F, &Kp); /* compute Kp = K(kp) */
+  z0 = TWO * Kp / (INTERNAL_PRECISION) n;
+  M = ONE;
+  A = ONE / d -> epsilon;
+
+  sncndnFK(HALF * z0, kp, &sn, &cn, &dn, &F, &Kj); /* compute xi */
+  xi = ONE / dn;
+  xisq = xi * xi;
+  invlambda = xi;
+
+  for (m = 0; m < d -> dd; m++) {
+    czero = 2 * (m + 1) == n; /* n even and m = dd -1 */
+
+    z = z0 * ((INTERNAL_PRECISION) m + ONE);
+    sncndnFK(z, kp, &sn, &cn, &dn, &F, &Kj);
+    t = cn / sn;
+    c = - t*t;
+    if (!czero) (d -> a)[d -> dn - 1 - m] = ksq / c;
+
+    z = z0 * ((INTERNAL_PRECISION) m + HALF);
+    sncndnFK(z, kp, &sn, &cn, &dn, &F, &Kj);
+    t = cn / sn;
+    cp = - t*t;
+    (d -> ap)[d -> dd - 1 - m] = ksq / cp;
+
+    M *= (ONE - c) / (ONE - cp);
+    A *= (czero ? -ksq : c) * (ONE - cp) / (cp * (ONE - c));
+    invlambda *= (ONE - c*xisq) / (ONE - cp*xisq);
+  }
+  invlambda /= M;
+  d -> A = TWO / (ONE + invlambda) * A;
+  d -> Delta = (invlambda - ONE) / (invlambda + ONE);
+
+  d -> gamma = (INTERNAL_PRECISION*) malloc((1 + d -> n) *
+					    sizeof(INTERNAL_PRECISION));
+  l = ONE / invlambda;
+  opl = ONE + l;
+  sncndnFK(sqrt( d -> type == 1
+		   ? (THREE + l) / (FOUR * opl)
+		   : (ONE + THREE*l) / (opl*opl*opl)
+	       ), sqrt(ONE - l*l), &sn, &cn, &dn, &F, &Kj);
+  s = M * F;
+  for (m = 0; m < d -> n; m++) {
+    sncndnFK(s + TWO*Kp*m/n, kp, &sn, &cn, &dn, &F, &Kj);
+    d -> gamma[m] = d -> epsilon / dn;
+  }
+
+  /* If R(x) is a Zolotarev rational approximation of degree (n,m) with maximum
+   * error Delta, then (1 - Delta^2) / R(x) is also an optimal Chebyshev
+   * approximation of degree (m,n) */
+
+  if (d -> type == 1) {
+    d -> A = (ONE - d -> Delta * d -> Delta) / d -> A;
+    tv = d -> a; d -> a = d -> ap; d -> ap = tv;
+    ts = d -> dn; d -> dn = d -> dd; d -> dd = ts;
+    ts = d -> deg_num; d -> deg_num = d -> deg_denom; d -> deg_denom = ts;
+  }
+
+  construct_partfrac(d);
+  construct_contfrac(d);
+
+  /* Converting everything to PRECISION for external use only */
+
+  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
+  zd -> A = (PRECISION) d -> A;
+  zd -> Delta = (PRECISION) d -> Delta;
+  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> n = d -> n;
+  zd -> type = d -> type;
+  zd -> dn = d -> dn;
+  zd -> dd = d -> dd;
+  zd -> da = d -> da;
+  zd -> db = d -> db;
+  zd -> deg_num = d -> deg_num;
+  zd -> deg_denom = d -> deg_denom;
+
+  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  free(d -> a);
+
+  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  free(d -> ap);
+
+  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  free(d -> alpha);
+
+  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  free(d -> beta);
+
+  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  free(d -> gamma);
+
+  free(d);
+  return zd;
+}
+
+zolotarev_data* bfm_higham(PRECISION epsilon, int n) {
+  INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
+  int m, czero;
+  zolotarev_data *zd;
+  izd *d = (izd*) malloc(sizeof(izd));
+
+  d -> type = 0;
+  d -> epsilon = (INTERNAL_PRECISION) epsilon;
+  d -> n = n;
+  d -> dd = n / 2;
+  d -> dn = d -> dd - 1 + n % 2; /* n even: dn = dd - 1, n odd: dn = dd */
+  d -> deg_denom = 2 * d -> dd;
+  d -> deg_num = 2 * d -> dn + 1;
+
+  d -> a = (INTERNAL_PRECISION*) malloc((1 + d -> dn) *
+					sizeof(INTERNAL_PRECISION));
+  d -> ap = (INTERNAL_PRECISION*) malloc(d -> dd *
+					 sizeof(INTERNAL_PRECISION));
+  A = (INTERNAL_PRECISION) n;
+  z0 = M_PI / A;
+  A = n % 2 == 0 ? A : ONE / A;
+  M = d -> epsilon * A;
+  epssq = d -> epsilon * d -> epsilon;
+
+  for (m = 0; m < d -> dd; m++) {
+    czero = 2 * (m + 1) == n; /* n even and m = dd - 1*/
+
+    if (!czero) {
+      z = z0 * ((INTERNAL_PRECISION) m + ONE);
+      t = tan(z);
+      c = - t*t;
+      (d -> a)[d -> dn - 1 - m] = c;
+      M *= epssq - c;
+    }
+
+    z = z0 * ((INTERNAL_PRECISION) m + HALF);
+    t = tan(z);
+    cp = - t*t;
+    (d -> ap)[d -> dd - 1 - m] = cp;
+    M /= epssq - cp;
+  }
+
+  d -> gamma = (INTERNAL_PRECISION*) malloc((1 + d -> n) *
+					    sizeof(INTERNAL_PRECISION));
+  for (m = 0; m < d -> n; m++) d -> gamma[m] = ONE;
+
+  d -> A = A;
+  d -> Delta = ONE - M;
+
+  construct_partfrac(d);
+  construct_contfrac(d);
+
+  /* Converting everything to PRECISION for external use only */
+
+  zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
+  zd -> A = (PRECISION) d -> A;
+  zd -> Delta = (PRECISION) d -> Delta;
+  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> n = d -> n;
+  zd -> type = d -> type;
+  zd -> dn = d -> dn;
+  zd -> dd = d -> dd;
+  zd -> da = d -> da;
+  zd -> db = d -> db;
+  zd -> deg_num = d -> deg_num;
+  zd -> deg_denom = d -> deg_denom;
+
+  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  free(d -> a);
+
+  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  free(d -> ap);
+
+  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  free(d -> alpha);
+
+  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  free(d -> beta);
+
+  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  free(d -> gamma);
+
+  free(d);
+  return zd;
+}
+
+#ifdef TEST
+
+#undef ZERO
+#define ZERO ((PRECISION) 0)
+#undef ONE
+#define ONE ((PRECISION) 1)
+#undef TWO
+#define TWO ((PRECISION) 2)
+
+/* Evaluate the rational approximation R(x) using the factored form */
+
+static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
+  int m;
+  PRECISION R;
+
+  if (rdata -> type == 0) {
+    R = rdata -> A * x;
+    for (m = 0; m < rdata -> deg_denom/2; m++)
+      R *= (2*(m+1) > rdata -> deg_num ? ONE : x*x - rdata -> a[m]) /
+	(x*x - rdata -> ap[m]);
+  } else {
+    R = rdata -> A / x;
+    for (m = 0; m < rdata -> deg_num/2; m++)
+      R *= (x*x - rdata -> a[m]) /
+	(2*(m+1) > rdata -> deg_denom ? ONE : x*x - rdata -> ap[m]);
+  }
+  return R;
+}
+
+/* Evaluate the rational approximation R(x) using the partial fraction form */
+
+static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
+  int m;
+  PRECISION R = rdata -> alpha[rdata -> da - 1];
+  for (m = 0; m < rdata -> dd; m++)
+    R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
+  if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
+  return R * x;
+}    
+
+/* Evaluate the rational approximation R(x) using continued fraction form. 
+ *
+ * If x = 0 and type = 1 then the result should be INF, whereas if x = 0 and
+ * type = 0 then the result should be 0, but division by zero will occur at
+ * intermediate stages of the evaluation. For IEEE implementations with
+ * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
+ * but with signalling overflow you will get an error message. */
+
+static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
+  int m;
+  PRECISION R = rdata -> beta[0] * x;
+  for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
+  return R;
+}    
+
+/* Evaluate the rational approximation R(x) using Cayley form */
+
+static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
+  int m;
+  PRECISION T;
+
+  T = rdata -> type == 0 ? ONE : -ONE;
+  for (m = 0; m < rdata -> n; m++)
+    T *= (rdata -> gamma[m] - x) / (rdata -> gamma[m] + x);
+  return (ONE - T) / (ONE + T);
+}
+
+/* Test program. Apart from printing out the parameters for R(x) it produces
+ * the following data files for plotting (unless NPLOT is defined):
+ *
+ * zolotarev-fn is a plot of R(x) for |x| < 1.2. This should look like sgn(x).
+ *
+ * zolotarev-err is a plot of the error |R(x) - sgn(x)| scaled by 1/Delta. This
+ * should oscillate deg_num + deg_denom + 2 times between +1 and -1 over the
+ * domain epsilon <= |x| <= 1.
+ *
+ * If ALLPLOTS is defined then zolotarev-partfrac (zolotarev-contfrac) is a
+ * plot of the difference between the values of R(x) computed using the
+ * factored form and the partial fraction (continued fraction) form, scaled by
+ * 1/Delta. It should be zero everywhere. */
+
+int main(int argc, char** argv) {
+  
+  int m, n, plotpts = 5000, type = 0;
+  float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
+  zolotarev_data *rdata;
+  PRECISION y;
+  FILE *plot_function, *plot_error, 
+    *plot_partfrac, *plot_contfrac, *plot_cayley;
+
+  if (argc < 3 || argc > 4) {
+    fprintf(stderr, "Usage: %s epsilon n [type]\n", *argv);
+    exit(EXIT_FAILURE);
+  }
+  sscanf(argv[1], "%g", &eps);	/* First argument is epsilon */
+  sscanf(argv[2], "%d", &n);	/* Second argument is n */
+  if (argc == 4) sscanf(argv[3], "%d", &type); /* Third argument is type */
+
+  if (type < 0 || type > 2) {
+    fprintf(stderr, "%s: type must be 0 (Zolotarev R(0) = 0),\n"
+	    "\t\t1 (Zolotarev R(0) = Inf, or 2 (Higham)\n", *argv);
+    exit(EXIT_FAILURE);
+  }
+
+  rdata = type == 2 
+    ? higham((PRECISION) eps, n) 
+    : zolotarev((PRECISION) eps, n, type);
+
+  printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
+	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
+	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
+	 "\tPRECISION = " STRINGIFY(PRECISION)
+	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
+	 "\tDelta = %g (maximum error)\n\n"
+	 "\tA = %g (overall factor)\n",
+	 (float) rdata -> epsilon, rdata -> n, type,
+	 rdata -> deg_num, rdata -> deg_denom,
+	 rdata -> type == 1 ? "infinite" : "zero",
+	 (float) rdata -> Delta, (float) rdata -> A);
+  for (m = 0; m < MIN(rdata -> dd, rdata -> dn); m++)
+    printf("\ta[%2d] = %14.8g\t\ta'[%2d] = %14.8g\n",
+	   m + 1, (float) rdata -> a[m], m + 1, (float) rdata -> ap[m]);
+  if (rdata -> dd > rdata -> dn)
+    printf("\t\t\t\t\ta'[%2d] = %14.8g\n",
+	   rdata -> dn + 1, (float) rdata -> ap[rdata -> dn]);
+  if (rdata -> dd < rdata -> dn)
+    printf("\ta[%2d] = %14.8g\n",
+	   rdata -> dd + 1, (float) rdata -> a[rdata -> dd]);
+
+  printf("\n\tPartial fraction coefficients\n");
+  printf("\talpha[ 0] = %14.8g\n",
+	 (float) rdata -> alpha[rdata -> da - 1]);
+  for (m = 0; m < rdata -> dd; m++)
+    printf("\talpha[%2d] = %14.8g\ta'[%2d] = %14.8g\n",
+	   m + 1, (float) rdata -> alpha[m], m + 1, (float) rdata -> ap[m]);
+  if (rdata -> type == 1)
+    printf("\talpha[%2d] = %14.8g\ta'[%2d] = %14.8g\n",
+	   rdata -> dd + 1, (float) rdata -> alpha[rdata -> dd],
+	   rdata -> dd + 1, (float) ZERO);
+
+  printf("\n\tContinued fraction coefficients\n");
+  for (m = 0; m < rdata -> db; m++)
+    printf("\tbeta[%2d] = %14.8g\n", m, (float) rdata -> beta[m]);
+
+  printf("\n\tCayley transform coefficients\n");
+  for (m = 0; m < rdata -> n; m++)
+    printf("\tgamma[%2d] = %14.8g\n", m, (float) rdata -> gamma[m]);
+
+#ifndef NPLOT
+  plot_function = fopen("zolotarev-fn.dat", "w");
+  plot_error = fopen("zolotarev-err.dat", "w");
+#ifdef ALLPLOTS
+  plot_partfrac = fopen("zolotarev-partfrac.dat", "w");
+  plot_contfrac = fopen("zolotarev-contfrac.dat", "w");
+  plot_cayley = fopen("zolotarev-cayley.dat", "w");
+#endif /* ALLPLOTS */
+  for (m = 0, maxypferr = maxycferr = maxycaylerr = 0.0; m <= plotpts; m++) {
+    x = 2.4 * (float) m / plotpts - 1.2;
+    if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
+      /* skip x = 0 for type 1, as R(0) is singular */
+      y = zolotarev_eval((PRECISION) x, rdata);
+      fprintf(plot_function, "%g %g\n", x, (float) y);
+      fprintf(plot_error, "%g %g\n",
+	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
+      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
+		       / rdata -> Delta);
+      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
+		       / rdata -> Delta);
+      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
+		       / rdata -> Delta);
+      if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
+	maxypferr = MAX(maxypferr, fabs(ypferr));
+	maxycferr = MAX(maxycferr, fabs(ycferr));
+	maxycaylerr = MAX(maxycaylerr, fabs(ycaylerr));
+      }
+#ifdef ALLPLOTS
+      fprintf(plot_partfrac, "%g %g\n", x, ypferr);
+      fprintf(plot_contfrac, "%g %g\n", x, ycferr);
+      fprintf(plot_cayley, "%g %g\n", x, ycaylerr);
+#endif /* ALLPLOTS */
+    }
+  }
+#ifdef ALLPLOTS
+  fclose(plot_cayley);
+  fclose(plot_contfrac);
+  fclose(plot_partfrac);
+#endif /* ALLPLOTS */
+  fclose(plot_error);
+  fclose(plot_function);
+
+  printf("\n\tMaximum PF error = %g (relative to Delta)\n", maxypferr);
+  printf("\tMaximum CF error = %g (relative to Delta)\n", maxycferr);
+  printf("\tMaximum Cayley error = %g (relative to Delta)\n", maxycaylerr);
+#endif /* NPLOT */
+
+  free(rdata -> a);
+  free(rdata -> ap);
+  free(rdata -> alpha);
+  free(rdata -> beta);
+  free(rdata -> gamma);
+  free(rdata);
+
+  return EXIT_SUCCESS;
+}
+#endif /* TEST */
--- a/lib/algorithms/approx/zolotarev.h
+++ b/lib/algorithms/approx/zolotarev.h
@@ -0,0 +1,85 @@
+/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
+
+
+#ifndef ZOLOTAREV_INTERNAL
+#ifndef PRECISION
+#define PRECISION double
+#endif
+#define ZPRECISION PRECISION
+#define ZOLOTAREV_DATA zolotarev_data
+#endif
+
+/* This struct contains the coefficients which parameterise an optimal rational
+ * approximation to the signum function.
+ *
+ * The parameterisations used are:
+ *
+ * Factored form for type 0 (R0(0) = 0)
+ *
+ * R0(x) = A * x * prod(x^2 - a[j], j = 0 .. dn-1) / prod(x^2 - ap[j], j = 0
+ * .. dd-1),
+ *
+ * where deg_num = 2*dn + 1 and deg_denom = 2*dd.
+ *
+ * Factored form for type 1 (R1(0) = infinity)
+ *
+ * R1(x) = (A / x) * prod(x^2 - a[j], j = 0 .. dn-1) / prod(x^2 - ap[j], j = 0
+ * .. dd-1),
+ *
+ * where deg_num = 2*dn and deg_denom = 2*dd + 1. 
+ *
+ * Partial fraction form
+ *
+ * R(x) = alpha[da] * x + sum(alpha[j] * x / (x^2 - ap[j]), j = 0 .. da-1)
+ *
+ * where da = dd for type 0 and da = dd + 1 with ap[dd] = 0 for type 1.
+ *
+ * Continued fraction form 
+ *
+ * R(x) = beta[db-1] * x + 1 / (beta[db-2] * x + 1 / (beta[db-3] * x + ...))
+ *
+ * with the final coefficient being beta[0], with d' = 2 * dd + 1 for type 0
+ * and db = 2 * dd + 2 for type 1.
+ *
+ * Cayley form (Chiu's domain wall formulation)
+ *
+ * R(x) = (1 - T(x)) / (1 + T(x))
+ *
+ * where T(x) = prod((x - gamma[j]) / (x + gamma[j]), j = 0 .. n-1)
+ */
+
+typedef struct {
+  ZPRECISION *a,      /* zeros of numerator, a[0 .. dn-1] */
+    *ap,	      /* poles (zeros of denominator), ap[0 .. dd-1] */
+    A,		      /* overall factor */
+    *alpha,	      /* coefficients of partial fraction, alpha[0 .. da-1] */
+    *beta,	      /* coefficients of continued fraction, beta[0 .. db-1] */
+    *gamma,	      /* zeros of numerator of T in Cayley form */
+    Delta,	      /* maximum error, |R(x) - sgn(x)| <= Delta */
+    epsilon;	      /* minimum x value, epsilon < |x| < 1 */
+  int n,	      /* approximation degree */
+    type,	      /* 0: R(0) = 0, 1: R(0) = infinity */
+    dn, dd, da, db,   /* number of elements of a, ap, alpha, and beta */
+    deg_num,	      /* degree of numerator = deg_denom +/- 1 */
+    deg_denom;	      /* degree of denominator */
+} ZOLOTAREV_DATA;
+
+#ifndef ZOLOTAREV_INTERNAL
+
+/* zolotarev(epsilon, n, type) returns a pointer to an initialised
+ * zolotarev_data structure. The arguments must satisfy the constraints that
+ * epsilon > 0, n > 0, and type = 0 or 1. */
+
+ZOLOTAREV_DATA* bfm_higham(PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* bfm_zolotarev(PRECISION epsilon, int n, int type);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -290,10 +290,7 @@ void WilsonMatrix::MpcDagMpc(const LatticeFermion &in, LatticeFermion &out)
 {
  return;
 }
-void WilsonMatrix::MDagM    (const LatticeFermion &in, LatticeFermion &out)
-{
-  return;
-}
+


 }}
--- a/lib/qcd/Grid_qcd_wilson_dop.h
+++ b/lib/qcd/Grid_qcd_wilson_dop.h
@@ -9,7 +9,7 @@ namespace Grid {

  namespace QCD {

-    class WilsonMatrix : public LinearOperatorBase<LatticeFermion>
+    class WilsonMatrix : public SparseMatrixBase<LatticeFermion>
    {
      //NB r=1;
    public:
@@ -40,6 +40,11 @@ namespace Grid {
      virtual void Mdag (const LatticeFermion &in, LatticeFermion &out);
      virtual void MdagM(const LatticeFermion &in, LatticeFermion &out);

+      // half checkerboard operaions
+      void Mpc      (const LatticeFermion &in, LatticeFermion &out);
+      void MpcDag   (const LatticeFermion &in, LatticeFermion &out);
+      void MpcDagMpc(const LatticeFermion &in, LatticeFermion &out);
+
      // non-hermitian hopping term; half cb or both
      void Dhop(const LatticeFermion &in, LatticeFermion &out);

@@ -48,17 +53,10 @@ namespace Grid {

      typedef iScalar<iMatrix<vComplex, Nc> > matrix;

-      // half checkerboard operaions
-      void MpcDag   (const LatticeFermion &in, LatticeFermion &out);
-      void Mpc      (const LatticeFermion &in, LatticeFermion &out);
-      void MpcDagMpc(const LatticeFermion &in, LatticeFermion &out);
-
-      // full checkerboard hermitian
-      void MDagM    (const LatticeFermion &in, LatticeFermion &out);
-
      
    };

+
  }
 }
 #endif