Relative Eigen links, allowing Grid's directory to be moved safely

Optional CUDA info
SerialisableClassName method for serialisable enum, and boolean to test if a serialisable object is an enum
2026-06-21 03:08:15 +01:00 · 2020-06-04 10:56:34 +01:00 · 2020-06-02 14:21:49 -04:00 · 2020-05-15 20:00:34 +01:00 · 2020-05-12 18:56:47 -04:00 · 2020-05-12 17:57:24 -04:00
504 changed files with 6580 additions and 38667 deletions
@@ -22,8 +22,18 @@
 #undef __CUDACC__
 #undef __CUDA_ARCH__
 #define __NVCC__REDEFINE__
+#endif 
+
+/* SYCL: save the compile environment before including Eigen.
+ * __SYCL__REDEFINE__ records that the state was pushed here, so that the
+ * matching "#ifdef __SYCL__REDEFINE__" restore block after the Eigen includes
+ * actually runs (without it the pushed macro would never be popped). */
+#ifdef __SYCL_DEVICE_ONLY__
+#pragma push
+#pragma push_macro("__SYCL_DEVICE_ONLY__")
+#undef __SYCL_DEVICE_ONLY__
+#undef EIGEN_USE_SYCL
+#define EIGEN_DONT_VECTORIZE
+#define __SYCL__REDEFINE__
 #endif

+
 #include <Grid/Eigen/Dense>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>

@@ -35,7 +45,14 @@
 #pragma pop
 #endif

+/*SYCL restore*/
+#ifdef __SYCL__REDEFINE__
+#pragma pop_macro("__SYCL_DEVICE_ONLY__")
+#pragma pop
+#endif
+
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif

+
@@ -39,14 +39,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/Remez.h>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
+#include <Grid/algorithms/approx/RemezGeneral.h>
+#include <Grid/algorithms/approx/ZMobius.h>

 #include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 #include <Grid/algorithms/iterative/MinimalResidual.h>
@@ -541,17 +541,14 @@ public:
  ///////////////////////
  GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know

-  RealD M (const CoarseVector &in, CoarseVector &out){
-
+  void M (const CoarseVector &in, CoarseVector &out)
+  {
    conformable(_grid,in.Grid());
    conformable(in.Grid(),out.Grid());

-    //    RealD Nin = norm2(in);
    SimpleCompressor<siteVector> compressor;

-    double comms_usec = -usecond();
    Stencil.HaloExchange(in,compressor);
-    comms_usec += usecond();

    auto in_v = in.View();
    auto out_v = out.View();
@@ -565,12 +562,7 @@ public:
    typedef decltype(coalescedRead(in_v[0])) calcVector;
    typedef decltype(coalescedRead(in_v[0](0))) calcComplex;

-    GridStopWatch ArithmeticTimer;
    int osites=Grid()->oSites();
-    //    double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
-    //    double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
-    double usecs =-usecond();
-    // assert(geom.npoint==9);

    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
      int ss = sss/nbasis;
@@ -598,23 +590,9 @@ public:
      }
      coalescedWrite(out_v[ss](b),res,lane);
    });
-    usecs +=usecond();
-
-    double nrm_usec=-usecond();
-    RealD Nout= norm2(out);
-    nrm_usec+=usecond();
-
-    /*
-        std::cout << GridLogMessage << "\tNorm        " << nrm_usec << " us" <<std::endl;
-        std::cout << GridLogMessage << "\tHalo        " << comms_usec << " us" <<std::endl;
-        std::cout << GridLogMessage << "\tMatrix      " << usecs << " us" <<std::endl;
-        std::cout << GridLogMessage << "\t  mflop/s   " << flops/usecs<<std::endl;
-        std::cout << GridLogMessage << "\t  MB/s      " << bytes/usecs<<std::endl;
-    */
-    return Nout;
  };

-  RealD Mdag (const CoarseVector &in, CoarseVector &out)
+  void Mdag (const CoarseVector &in, CoarseVector &out)
  {
    if(hermitian) {
      // corresponds to Petrov-Galerkin coarsening
@@ -625,7 +603,6 @@ public:
      G5C(tmp, in); 
      M(tmp, out);
      G5C(out, out);
-      return norm2(out);
    }
  };
  void MdirComms(const CoarseVector &in)
@@ -870,8 +847,6 @@ public:
 	    auto A_self  = A[self_stencil].View();

 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
-	    //      if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
-	    //	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });

 	  }
 	}
@@ -43,7 +43,6 @@ NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////////////////
 template<class Field> class LinearOperatorBase {
 public:
-
  // Support for coarsening to a multigrid
  virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
  virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
@@ -94,7 +93,10 @@ public:
    _Mat.Mdag(in,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    _Mat.MdagM(in,out,n1,n2);
+    _Mat.MdagM(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
  }
  void HermOp(const Field &in, Field &out){
    _Mat.MdagM(in,out);
@@ -131,17 +133,14 @@ public:
    assert(0);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    _Mat.MdagM(in,out,n1,n2);
-    out = out + _shift*in;
-
-    ComplexD dot;	
-    dot= innerProduct(in,out);
+    HermOp(in,out);
+    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
  void HermOp(const Field &in, Field &out){
-    RealD n1,n2;
-    HermOpAndNorm(in,out,n1,n2);
+    _Mat.MdagM(in,out);
+    out = out + _shift*in;
  }
 };

@@ -170,7 +169,7 @@ public:
    _Mat.M(in,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    _Mat.M(in,out);
+    HermOp(in,out);
    ComplexD dot= innerProduct(in,out); n1=real(dot);
    n2=norm2(out);
  }
@@ -208,212 +207,305 @@ public:
  }
 };

-    //////////////////////////////////////////////////////////
-    // Even Odd Schur decomp operators; there are several
-    // ways to introduce the even odd checkerboarding
-    //////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////
+// Even Odd Schur decomp operators; there are several
+// ways to introduce the even odd checkerboarding
+//////////////////////////////////////////////////////////

-    template<class Field>
-    class SchurOperatorBase :  public LinearOperatorBase<Field> {
-    public:
-      virtual  RealD Mpc      (const Field &in, Field &out) =0;
-      virtual  RealD MpcDag   (const Field &in, Field &out) =0;
-      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
-      Field tmp(in.Grid());
-      tmp.Checkerboard() = in.Checkerboard();
-	ni=Mpc(in,tmp);
-	no=MpcDag(tmp,out);
-      }
-      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-      out.Checkerboard() = in.Checkerboard();
-	MpcDagMpc(in,out,n1,n2);
-      }
-      virtual void HermOp(const Field &in, Field &out){
-	RealD n1,n2;
-	HermOpAndNorm(in,out,n1,n2);
-      }
-      void Op     (const Field &in, Field &out){
-	Mpc(in,out);
-      }
-      void AdjOp     (const Field &in, Field &out){ 
-	MpcDag(in,out);
-      }
-      // Support for coarsening to a multigrid
-      void OpDiag (const Field &in, Field &out) {
-	assert(0); // must coarsen the unpreconditioned system
-      }
-      void OpDir  (const Field &in, Field &out,int dir,int disp) {
-	assert(0);
-      }
-      void OpDirAll  (const Field &in, std::vector<Field> &out){
-	assert(0);
-      };
-    };
-    template<class Matrix,class Field>
-    class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
-    public:
-      Matrix &_Mat;
-      SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
-      virtual  RealD Mpc      (const Field &in, Field &out) {
-      Field tmp(in.Grid());
-      tmp.Checkerboard() = !in.Checkerboard();
-	//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
-
-	_Mat.Meooe(in,tmp);
-	_Mat.MooeeInv(tmp,out);
-	_Mat.Meooe(out,tmp);
-
-      //std::cout << "cb in " << in.Checkerboard() << "  cb out " << out.Checkerboard() << std::endl;
-	_Mat.Mooee(in,out);
-	return axpy_norm(out,-1.0,tmp,out);
-      }
-      virtual  RealD MpcDag   (const Field &in, Field &out){
-	Field tmp(in.Grid());
-
-	_Mat.MeooeDag(in,tmp);
-        _Mat.MooeeInvDag(tmp,out);
-	_Mat.MeooeDag(out,tmp);
-
-	_Mat.MooeeDag(in,out);
-	return axpy_norm(out,-1.0,tmp,out);
-      }
-    };
-    template<class Matrix,class Field>
-      class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
-    protected:
-      Matrix &_Mat;
-    public:
-      SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
-
-      virtual  RealD Mpc      (const Field &in, Field &out) {
-	Field tmp(in.Grid());
-
-	_Mat.Meooe(in,out);
-	_Mat.MooeeInv(out,tmp);
-	_Mat.Meooe(tmp,out);
-	_Mat.MooeeInv(out,tmp);
-
-	return axpy_norm(out,-1.0,tmp,in);
-      }
-      virtual  RealD MpcDag   (const Field &in, Field &out){
-	Field tmp(in.Grid());
-
-	_Mat.MooeeInvDag(in,out);
-	_Mat.MeooeDag(out,tmp);
-	_Mat.MooeeInvDag(tmp,out);
-	_Mat.MeooeDag(out,tmp);
-
-	return axpy_norm(out,-1.0,tmp,in);
-      }
-    };
-    template<class Matrix,class Field>
-      class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
-    protected:
-      Matrix &_Mat;
-    public:
-      SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
-
-      virtual  RealD Mpc      (const Field &in, Field &out) {
-	Field tmp(in.Grid());
-
-	_Mat.MooeeInv(in,out);
-	_Mat.Meooe(out,tmp);
-	_Mat.MooeeInv(tmp,out);
-	_Mat.Meooe(out,tmp);
-
-	return axpy_norm(out,-1.0,tmp,in);
-      }
-      virtual  RealD MpcDag   (const Field &in, Field &out){
-	Field tmp(in.Grid());
-
-	_Mat.MeooeDag(in,out);
-	_Mat.MooeeInvDag(out,tmp);
-	_Mat.MeooeDag(tmp,out);
-	_Mat.MooeeInvDag(out,tmp);
-
-	return axpy_norm(out,-1.0,tmp,in);
-      }
-    };
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
-    // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
-    template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    //  Staggered use
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class Matrix,class Field>
-      class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
-    protected:
-      Matrix &_Mat;
-      Field tmp;
-      RealD mass;
-      double tMpc;
-      double tIP;
-      double tMeo;
-      double taxpby_norm;
-      uint64_t ncall;
-public:
-      void Report(void)
-      {
-	std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
-	std::cout << GridLogMessage << " HermOpAndNorm.IP  "<< tIP /ncall<<" usec "<<std::endl;
-	std::cout << GridLogMessage << " Mpc.MeoMoe        "<< tMeo/ncall<<" usec "<<std::endl;
-	std::cout << GridLogMessage << " Mpc.axpby_norm    "<< taxpby_norm/ncall<<" usec "<<std::endl;
-      }
-      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) 
-      { 
-	assert( _Mat.isTrivialEE() );
-	mass = _Mat.Mass();
-	tMpc=0;
-	tIP =0;
-        tMeo=0;
-        taxpby_norm=0;
-	ncall=0;
-      }
+template<class Field>
+class SchurOperatorBase :  public LinearOperatorBase<Field> {
+ public:
+  virtual  void Mpc      (const Field &in, Field &out) =0;
+  virtual  void MpcDag   (const Field &in, Field &out) =0;
+  virtual  void MpcDagMpc(const Field &in, Field &out) {
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = in.Checkerboard();
+    Mpc(in,tmp);
+    MpcDag(tmp,out);
+  }
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-	ncall++;
-	tMpc-=usecond();
-    n2 = Mpc(in,out);
-	tMpc+=usecond();
-	tIP-=usecond();
-    ComplexD dot= innerProduct(in,out);
-	tIP+=usecond();
-    n1 = real(dot);
+    out.Checkerboard() = in.Checkerboard();
+    MpcDagMpc(in,out);
+    ComplexD dot= innerProduct(in,out); 
+    n1=real(dot);
+    n2=norm2(out);
  }
  virtual void HermOp(const Field &in, Field &out){
-	ncall++;
-	tMpc-=usecond();
-	_Mat.Meooe(in,out);
-	_Mat.Meooe(out,tmp);
-	tMpc+=usecond();
-	taxpby_norm-=usecond();
-	axpby(out,-1.0,mass*mass,tmp,in);
-	taxpby_norm+=usecond();
+    out.Checkerboard() = in.Checkerboard();
+    MpcDagMpc(in,out);
  }
-  virtual  RealD Mpc      (const Field &in, Field &out) 
-  {
+  void Op     (const Field &in, Field &out){
+    Mpc(in,out);
+  }
+  void AdjOp     (const Field &in, Field &out){ 
+    MpcDag(in,out);
+  }
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    assert(0); // must coarsen the unpreconditioned system
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    assert(0);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  };
+};
+// Schur operator whose Mpc/MpcDag are assembled from the wrapped matrix's
+// Meooe/Mooee/MooeeInv (and Dag) pieces. _Mat is held by reference, so the
+// matrix must outlive this object.
+template<class Matrix,class Field>
+  class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
+ public:
+    Matrix &_Mat;
+    SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
+    // Applies Meooe, MooeeInv, Meooe to `in` (result left in tmp), applies
+    // Mooee to `in` (result in out), then combines them with axpy.
+    // NOTE(review): assuming axpy(r,a,x,y) computes r = a*x + y, this gives
+    // out = Mooee*in - Meooe*MooeeInv*Meooe*in -- confirm against axpy.
+    virtual  void Mpc      (const Field &in, Field &out) {
+      Field tmp(in.Grid());
+      tmp.Checkerboard() = !in.Checkerboard(); // tmp starts on the opposite checkerboard to in
+      
+      _Mat.Meooe(in,tmp);
+      _Mat.MooeeInv(tmp,out);
+      _Mat.Meooe(out,tmp);
+      _Mat.Mooee(in,out);
+      axpy(out,-1.0,tmp,out);
+    }
+    // Same sequence as Mpc but using the Dag variants of every matrix piece.
+    // Unlike Mpc, tmp's checkerboard is not set explicitly here.
+    virtual void MpcDag   (const Field &in, Field &out){
+      Field tmp(in.Grid());
+	
+      _Mat.MeooeDag(in,tmp);
+      _Mat.MooeeInvDag(tmp,out);
+      _Mat.MeooeDag(out,tmp);
+      _Mat.MooeeDag(in,out);
+      axpy(out,-1.0,tmp,out);
+    }
+};
+// Schur operator that applies the chain MooeeInv*Meooe*MooeeInv*Meooe to the
+// input and combines the result with the input itself.
+// NOTE(review): assuming axpy(r,a,x,y) computes r = a*x + y, Mpc yields
+// out = in - MooeeInv*Meooe*MooeeInv*Meooe*in -- confirm against axpy.
+template<class Matrix,class Field>
+  class SchurDiagOneOperator :  public SchurOperatorBase<Field> {
+ protected:
+    Matrix &_Mat; // wrapped matrix, held by reference
+ public:
+    SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
+    
+    // out and tmp are used alternately as scratch; the chained result ends in tmp.
+    virtual void Mpc      (const Field &in, Field &out) {
+      Field tmp(in.Grid());

+      _Mat.Meooe(in,out);
+      _Mat.MooeeInv(out,tmp);
+      _Mat.Meooe(tmp,out);
+      _Mat.MooeeInv(out,tmp);
+      axpy(out,-1.0,tmp,in);
+    }
+    // Dag counterpart: the Dag pieces are applied in the reverse order of Mpc
+    // (MooeeInvDag first, MeooeDag last) before the same axpy combination.
+    virtual void MpcDag   (const Field &in, Field &out){
+      Field tmp(in.Grid());
+      
+      _Mat.MooeeInvDag(in,out);
+      _Mat.MeooeDag(out,tmp);
+      _Mat.MooeeInvDag(tmp,out);
+      _Mat.MeooeDag(out,tmp);
+      axpy(out,-1.0,tmp,in);
+    }
+};
+// Schur operator that applies the chain Meooe*MooeeInv*Meooe*MooeeInv to the
+// input and combines the result with the input itself.
+// NOTE(review): assuming axpy(r,a,x,y) computes r = a*x + y, Mpc yields
+// out = in - Meooe*MooeeInv*Meooe*MooeeInv*in -- confirm against axpy.
+template<class Matrix,class Field>
+  class SchurDiagTwoOperator :  public SchurOperatorBase<Field> {
+ protected:
+    Matrix &_Mat; // wrapped matrix, held by reference
+ public:
+    SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
+    
+    // out and tmp are used alternately as scratch; the chained result ends in tmp.
+    virtual void Mpc      (const Field &in, Field &out) {
+      Field tmp(in.Grid());
+      
+      _Mat.MooeeInv(in,out);
+      _Mat.Meooe(out,tmp);
+      _Mat.MooeeInv(tmp,out);
+      _Mat.Meooe(out,tmp);
+      
+      axpy(out,-1.0,tmp,in);
+    }
+    // Dag counterpart: the Dag pieces are applied in the reverse order of Mpc
+    // (MeooeDag first, MooeeInvDag last) before the same axpy combination.
+    virtual  void MpcDag   (const Field &in, Field &out){
+      Field tmp(in.Grid());
+
+      _Mat.MeooeDag(in,out);
+      _Mat.MooeeInvDag(out,tmp);
+      _Mat.MeooeDag(tmp,out);
+      _Mat.MooeeInvDag(out,tmp);
+
+      axpy(out,-1.0,tmp,in);
+    }
+};
+
+// Base class for Schur operators that are only meant to be used through
+// Op/AdjOp: derived classes supply Mpc and MpcDag, while the Hermitian entry
+// points (HermOp, HermOpAndNorm) and the coarsening hooks all assert(0).
+template<class Field>
+class NonHermitianSchurOperatorBase :  public LinearOperatorBase<Field> 
+{
+ public:
+  virtual void  Mpc      (const Field& in, Field& out) = 0;
+  virtual void  MpcDag   (const Field& in, Field& out) = 0;
+  // Applies Mpc followed by MpcDag, using a temporary on in's grid and checkerboard.
+  virtual void  MpcDagMpc(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = in.Checkerboard();
+    Mpc(in,tmp);
+    MpcDag(tmp,out);
+  }
+  // Not supported by this base: always trips assert(0); n1/n2 are left untouched.
+  virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
+    assert(0);
+  }
+  // Not supported by this base: always trips assert(0).
+  virtual void HermOp(const Field& in, Field& out) {
+    assert(0);
+  }
+  // Forward application: delegates to Mpc.
+  void Op(const Field& in, Field& out) {
+    Mpc(in, out);
+  }
+  // Adjoint application: delegates to MpcDag.
+  void AdjOp(const Field& in, Field& out) { 
+    MpcDag(in, out);
+  }
+  // Support for coarsening to a multigrid
+  void OpDiag(const Field& in, Field& out) {
+    assert(0); // must coarsen the unpreconditioned system
+  }
+  void OpDir(const Field& in, Field& out, int dir, int disp) {
+    assert(0);
+  }
+  void OpDirAll(const Field& in, std::vector<Field>& out){
+    assert(0);
+  };
+};
+
+// Non-Hermitian Schur operator assembled from the wrapped matrix's
+// Meooe/Mooee/MooeeInv (and Dag) pieces. _Mat is held by reference.
+// NOTE(review): assuming axpy(r,a,x,y) computes r = a*x + y, Mpc yields
+// out = Mooee*in - Meooe*MooeeInv*Meooe*in -- confirm against axpy.
+template<class Matrix, class Field>
+class NonHermitianSchurDiagMooeeOperator :  public NonHermitianSchurOperatorBase<Field> 
+{
+ public:
+  Matrix& _Mat;
+ NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
+  // Chain Meooe, MooeeInv, Meooe on `in` (ends in tmp), Mooee on `in` (in out),
+  // then combine the two with axpy.
+  virtual void Mpc(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = !in.Checkerboard(); // tmp starts on the opposite checkerboard to in
+    
+    _Mat.Meooe(in, tmp);
+    _Mat.MooeeInv(tmp, out);
+    _Mat.Meooe(out, tmp);
+    
+    _Mat.Mooee(in, out);
+    
+    axpy(out, -1.0, tmp, out);
+  }
+  // Same sequence as Mpc using the Dag variants; tmp's checkerboard is not
+  // set explicitly here.
+  virtual void MpcDag(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    
+    _Mat.MeooeDag(in, tmp);
+    _Mat.MooeeInvDag(tmp, out);
+    _Mat.MeooeDag(out, tmp);
+	  
+    _Mat.MooeeDag(in, out);
+    
+    axpy(out, -1.0, tmp, out);
+  }
+};
+    
+// Non-Hermitian Schur operator applying the chain
+// MooeeInv*Meooe*MooeeInv*Meooe to the input and combining it with the input.
+// NOTE(review): assuming axpy(r,a,x,y) computes r = a*x + y, Mpc yields
+// out = in - MooeeInv*Meooe*MooeeInv*Meooe*in -- confirm against axpy.
+template<class Matrix,class Field>
+class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field> 
+{
+ protected:
+  Matrix &_Mat; // wrapped matrix, held by reference
+  
+ public:
+  NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
+  // out and tmp alternate as scratch; the chained result ends in tmp.
+  virtual void Mpc(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+	  
+    _Mat.Meooe(in, out);
+    _Mat.MooeeInv(out, tmp);
+    _Mat.Meooe(tmp, out);
+    _Mat.MooeeInv(out, tmp);
+
+    axpy(out, -1.0, tmp, in);
+  }
+  // Dag counterpart: Dag pieces applied in the reverse order of Mpc.
+  virtual void MpcDag(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    
+    _Mat.MooeeInvDag(in, out);
+    _Mat.MeooeDag(out, tmp);
+    _Mat.MooeeInvDag(tmp, out);
+    _Mat.MeooeDag(out, tmp);
+    
+    axpy(out, -1.0, tmp, in);
+  }
+};
+
+// Non-Hermitian Schur operator applying the chain
+// Meooe*MooeeInv*Meooe*MooeeInv to the input and combining it with the input.
+// NOTE(review): assuming axpy(r,a,x,y) computes r = a*x + y, Mpc yields
+// out = in - Meooe*MooeeInv*Meooe*MooeeInv*in -- confirm against axpy.
+template<class Matrix, class Field>
+class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field> 
+{
+ protected:
+  Matrix& _Mat; // wrapped matrix, held by reference
+  
+ public:
+ NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
+
+  // out and tmp alternate as scratch; the chained result ends in tmp.
+  virtual void Mpc(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    
+    _Mat.MooeeInv(in, out);
+    _Mat.Meooe(out, tmp);
+    _Mat.MooeeInv(tmp, out);
+    _Mat.Meooe(out, tmp);
+
+    axpy(out, -1.0, tmp, in);
+  }
+  // Dag counterpart: Dag pieces applied in the reverse order of Mpc.
+  virtual void MpcDag(const Field& in, Field& out) {
+    Field tmp(in.Grid());
+    
+    _Mat.MeooeDag(in, out);
+    _Mat.MooeeInvDag(out, tmp);
+    _Mat.MeooeDag(tmp, out);
+    _Mat.MooeeInvDag(out, tmp);
+
+    axpy(out, -1.0, tmp, in);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
+// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
+///////////////////////////////////////////////////////////////////////////////////////////////////
+template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
+template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//  Staggered use
+///////////////////////////////////////////////////////////////////////////////////////////////////
+template<class Matrix,class Field>
+class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
+ protected:
+  Matrix &_Mat;
+  Field tmp;
+  RealD mass;
+ public:
+  SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) 
+  { 
+    assert( _Mat.isTrivialEE() );
+    mass = _Mat.Mass();
+  }
+  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    Mpc(in,out);
+    ComplexD dot= innerProduct(in,out);
+    n1 = real(dot);
+    n2 =0.0;
+  }
+  virtual void HermOp(const Field &in, Field &out){
+    Mpc(in,out);
+    //    _Mat.Meooe(in,out);
+    //    _Mat.Meooe(out,tmp);
+    //    axpby(out,-1.0,mass*mass,tmp,in);
+  }
+  virtual  void Mpc      (const Field &in, Field &out) 
+  {
    Field tmp(in.Grid());
    Field tmp2(in.Grid());
+	
+    //    _Mat.Mooee(in,out);
+    //    _Mat.Mooee(out,tmp);

-    //    std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
-    _Mat.Mooee(in,out);
-    _Mat.Mooee(out,tmp);
-    //    std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
-
-    tMeo-=usecond();
    _Mat.Meooe(in,out);
    _Mat.Meooe(out,tmp);
-    tMeo+=usecond();
-    taxpby_norm-=usecond();
-    RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
-    taxpby_norm+=usecond();
-    return nn;
+    axpby(out,-1.0,mass*mass,tmp,in);
  }
-  virtual  RealD MpcDag   (const Field &in, Field &out){
-    return Mpc(in,out);
+  virtual  void MpcDag   (const Field &in, Field &out){
+    Mpc(in,out);
  }
  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
    assert(0);// Never need with staggered
@@ -421,7 +513,6 @@ public:
 };
 template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;

-
 /////////////////////////////////////////////////////////////
 // Base classes for functions of operators
 /////////////////////////////////////////////////////////////
@@ -38,16 +38,12 @@ template<class Field> class SparseMatrixBase {
 public:
  virtual GridBase *Grid(void) =0;
  // Full checkerboar operations
-  virtual RealD M    (const Field &in, Field &out)=0;
-  virtual RealD Mdag (const Field &in, Field &out)=0;
-  virtual void  MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
-    Field tmp (in.Grid());
-    ni=M(in,tmp);
-    no=Mdag(tmp,out);
-  }
+  virtual void  M    (const Field &in, Field &out)=0;
+  virtual void  Mdag (const Field &in, Field &out)=0;
  virtual void  MdagM(const Field &in, Field &out) {
-    RealD ni, no;
-    MdagM(in,out,ni,no);
+    Field tmp (in.Grid());
+    M(in,tmp);
+    Mdag(tmp,out);
  }
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
@@ -234,10 +234,8 @@ public:

    GridBase *grid=in.Grid();

-    // std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
-    //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
-
    int vol=grid->gSites();
+    typedef typename Field::vector_type vector_type;

    Field T0(grid); T0 = in;  
    Field T1(grid); 
@@ -258,14 +256,28 @@ public:
    //    out = ()*T0 + Coeffs[1]*T1;
    axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
    for(int n=2;n<order;n++){
-	
+
      Linop.HermOp(*Tn,y);
-      //     y=xscale*y+mscale*(*Tn);
-      //      *Tnp=2.0*y-(*Tnm);
-      //      out=out+Coeffs[n]* (*Tnp);
+#if 0
+      auto y_v = y.View();
+      auto Tn_v = Tn->View();
+      auto Tnp_v = Tnp->View();
+      auto Tnm_v = Tnm->View();
+      constexpr int Nsimd = vector_type::Nsimd();
+      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+      });
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#else
      axpby(y,xscale,mscale,y,(*Tn));
      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
-      axpy(out,Coeffs[n],*Tnp,out);
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#endif
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
@@ -0,0 +1,473 @@
+#include<math.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<string>
+#include<iostream>
+#include<iomanip>
+#include<cassert>
+
+#include<Grid/algorithms/approx/RemezGeneral.h>
+
+
+// Constructor
+//   lower, upper : bounds of the approximation interval (stored as apstrt/apend,
+//                  with apwidt = upper - lower)
+//   precision    : passed to bigfloat::setDefaultPrecision
+//   f, data      : function to approximate and an opaque pointer stored with it
+// The polynomial degrees (n, d, pow_n, pow_d) start at zero.
+// Prints the bounds and the precision to std::cout.
+AlgRemezGeneral::AlgRemezGeneral(double lower, double upper, long precision,
+				 bigfloat (*f)(bigfloat x, void *data), void *data): f(f), 
+										     data(data), 
+										     prec(precision),
+										     apstrt(lower), apend(upper), apwidt(upper - lower),
+										     n(0), d(0), pow_n(0), pow_d(0)
+{
+  bigfloat::setDefaultPrecision(prec);
+
+  std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n";
+  std::cout<<"Precision of arithmetic is "<<precision<<std::endl;
+}
+
+//Record the degrees and parity types of the numerator and denominator and
+//build, for each, the map from power to coefficient index (-1 = power absent)
+void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in){
+  pow_n = num_degree;
+  pow_d = den_degree;
+
+  //The parity of the highest power must agree with the requested polynomial type
+  if(num_type_in == PolyType::Odd  && pow_n % 2 == 0) assert(0);
+  if(num_type_in == PolyType::Even && pow_n % 2 == 1) assert(0);
+
+  if(den_type_in == PolyType::Odd  && pow_d % 2 == 0) assert(0);
+  if(den_type_in == PolyType::Even && pow_d % 2 == 1) assert(0);
+
+  num_type = num_type_in;
+  den_type = den_type_in;
+
+  num_pows.resize(pow_n+1);
+  den_pows.resize(pow_d+1);
+
+  //Numerator: assign consecutive indices to the powers that are present
+  const bool num_has_odd  = num_type == PolyType::Full || num_type == PolyType::Odd;
+  const bool num_has_even = num_type == PolyType::Full || num_type == PolyType::Even;
+  int num_terms = 0;
+  for(int p=0;p<=pow_n;p++){
+    const bool present = (p % 2 == 0) ? num_has_even : num_has_odd;
+    num_pows[p] = present ? num_terms++ : -1;
+  }
+
+  std::cout << num_terms << " terms in numerator" << std::endl;
+
+  //Denominator: same construction
+  const bool den_has_odd  = den_type == PolyType::Full || den_type == PolyType::Odd;
+  const bool den_has_even = den_type == PolyType::Full || den_type == PolyType::Even;
+  int den_terms = 0;
+  for(int p=0;p<=pow_d;p++){
+    const bool present = (p % 2 == 0) ? den_has_even : den_has_odd;
+    den_pows[p] = present ? den_terms++ : -1;
+  }
+
+  std::cout << den_terms << " terms in denominator" << std::endl;
+
+  //n and d are one less than the number of terms, eg  pow=1   a x^1  + b x^0
+  n = num_terms - 1;
+  d = den_terms - 1;
+}
+
+//Setup algorithm
+//Resets spread and iter, derives the number of unknowns neq from n and d,
+//sizes all work arrays accordingly and seeds them via initialGuess() and
+//stpini(). Must run after n and d have been set.
+void AlgRemezGeneral::reinitializeAlgorithm(){
+  spread = 1.0e37;
+  iter = 0;
+
+  neq = n + d + 1; //not +2 because highest-power term in denominator is fixed to 1
+
+  param.resize(neq);
+  yy.resize(neq+1);
+
+  //Initialize linear equation temporaries
+  A.resize(neq*neq);
+  B.resize(neq);
+  IPS.resize(neq);
+
+  //Initialize maximum and minimum errors
+  //(xx and mm must be sized before initialGuess() fills them)
+  xx.resize(neq+2);
+  mm.resize(neq+1);
+  initialGuess();
+
+  //Initialize search steps
+  //(stpini() reads the xx values written by initialGuess())
+  step.resize(neq+1);
+  stpini();
+}
+
+// Run the iteration for the requested rational form and return the error
+// reported by getErr() at mm[0] once the spread has dropped to the tolerance.
+//   num_degree / den_degree   : highest powers of numerator / denominator
+//   num_type_in / den_type_in : parity restriction of each polynomial
+//   _tolerance                : iteration stops once spread <= _tolerance
+//   report_freq               : progress is printed every report_freq
+//                               iterations; 0 disables progress output
+// Asserts if delta falls below the tolerance before convergence.
+double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degree, 
+				       const PolyType num_type_in, const PolyType den_type_in, 
+				       const double _tolerance, const int report_freq){
+  //Setup the properties of the polynomial
+  setupPolyProperties(num_degree, den_degree, num_type_in, den_type_in);
+
+  //Setup the algorithm
+  reinitializeAlgorithm();
+
+  bigfloat tolerance = _tolerance;
+
+  //Iterate until convergence
+  while (spread > tolerance) { 
+    //Guard the modulo: report_freq == 0 would otherwise be a division by zero
+    const bool report = (report_freq != 0) && (iter % report_freq == 0);
+    ++iter;
+    if (report)
+      std::cout<<"Iteration " <<iter-1<<" spread "<<(double)spread<<" delta "<<(double)delta << std::endl; 
+
+    equations();
+    if (delta < tolerance) {
+      std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
+      assert(0);
+    }
+    assert( delta>= tolerance );
+
+    search();
+  }
+
+  int sign;
+  double error = (double)getErr(mm[0],&sign);
+  std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl;
+
+  // Return the maximum error in the approximation
+  return error;
+}
+
+
+// Initial values of maximal and minimal errors
+// Fills mm[0..neq] (extremum guesses, with mm[0]=apstrt and mm[neq]=apend)
+// and xx[0..neq] (zero guesses) from cosine-spaced points in [0,1] that are
+// then passed through (exp(r)-1)/(e-1) and scaled onto the interval.
+// The cosine and exponential are evaluated in double precision.
+void AlgRemezGeneral::initialGuess(){
+  // Supply initial guesses for solution points
+  long ncheb = neq;			// Degree of Chebyshev error estimate
+
+  // Find ncheb+1 extrema of Chebyshev polynomial
+  bigfloat a = ncheb;
+  bigfloat r;
+
+  mm[0] = apstrt;
+  for (long i = 1; i < ncheb; i++) {
+    r = 0.5 * (1 - cos((M_PI * i)/(double) a));
+    //r *= sqrt_bf(r);
+    r = (exp((double)r)-1.0)/(exp(1.0)-1.0); // remap r within [0,1]
+    mm[i] = apstrt + r * apwidt;
+  }
+  mm[ncheb] = apend;
+
+  // Same construction at the half-integer points (2i+1)/(2*ncheb) for the zeros
+  a = 2.0 * ncheb;
+  for (long i = 0; i <= ncheb; i++) {
+    r = 0.5 * (1 - cos(M_PI * (2*i+1)/(double) a));
+    //r *= sqrt_bf(r); // Squeeze to low end of interval
+    r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
+    xx[i] = apstrt + r * apwidt;
+  }
+}
+
+// Initialise step sizes
+// step[k] starts as the gap between neighbouring zeros in xx (the first gap
+// is measured from apstrt); the last entry repeats the one before it.
+void AlgRemezGeneral::stpini(){
+  delta = 0.25;
+  xx[neq+1] = apend;
+
+  step[0] = xx[0] - apstrt;
+  int k = 1;
+  while (k < neq) {
+    step[k] = xx[k] - xx[k-1];
+    ++k;
+  }
+  step[neq] = step[neq-1];
+}
+
+// Search for error maxima and minima
+// For each of the neq+1 intervals delimited by apstrt, the zeros xx[] and
+// apend, start from the current extremum estimate mm[i] and march in steps of
+// step[i] while the error magnitude from getErr() does not decrease (the
+// steps counter bounds the march). The final position and value are stored in
+// mm[i] and yy[i]. Afterwards spread, delta and step[] are updated and the
+// zeros xx[] are moved.
+void AlgRemezGeneral::search(){
+  bigfloat a, q, xm, ym, xn, yn, xx1;
+  int emsign, ensign, steps;
+
+  int meq = neq + 1;
+
+  bigfloat eclose = 1.0e30;   // smallest extremum value seen so far
+  bigfloat farther = 0l;      // largest extremum value seen so far
+
+  bigfloat xx0 = apstrt;      // lower boundary of the current interval
+
+  for (int i = 0; i < meq; i++) {
+    steps = 0;
+    xx1 = xx[i]; // Next zero
+    if (i == meq-1) xx1 = apend;
+    xm = mm[i];
+    ym = getErr(xm,&emsign);
+    q = step[i];
+    xn = xm + q;
+    if (xn < xx0 || xn >= xx1) {	// Cannot skip over adjacent boundaries
+      q = -q;
+      xn = xm;
+      yn = ym;
+      ensign = emsign;
+    } else {
+      yn = getErr(xn,&ensign);
+      if (yn < ym) {
+	// Error shrinks in this direction: reverse and restart from xm
+	q = -q;
+	xn = xm;
+	yn = ym;
+	ensign = emsign;
+      }
+    }
+  
+    while(yn >= ym) {		// March until error becomes smaller.
+      if (++steps > 10)
+      	break;
+      
+      ym = yn;
+      xm = xn;
+      emsign = ensign;
+      a = xm + q;
+      if (a == xm || a <= xx0 || a >= xx1)
+	break;// Must not skip over the zeros either side.      
+
+      xn = a;
+      yn = getErr(xn,&ensign);
+    }
+
+    mm[i] = xm;			// Position of maximum
+    yy[i] = ym;			// Value of maximum
+
+    if (eclose > ym) eclose = ym;
+    if (farther < ym) farther = ym;
+
+    xx0 = xx1; // Walk to next zero.
+  } // end of search loop
+
+  q = (farther - eclose);	// Decrease step size if error spread increased
+
+  if (eclose != 0.0) q /= eclose; // Relative error spread
+
+  if (q >= spread)
+    delta *= 0.5; // Spread is increasing; decrease step size
+  
+  spread = q;
+
+  // New step sizes: based on the ratio of neighbouring extremum values
+  // (capped at 0.25), scaled by the gap between extrema and by delta
+  for (int i = 0; i < neq; i++) {
+    q = yy[i+1];
+    if (q != 0.0) q = yy[i] / q  - (bigfloat)1l;
+    else q = 0.0625;
+    if (q > (bigfloat)0.25) q = 0.25;
+    q *= mm[i+1] - mm[i];
+    step[i] = q * delta;
+  }
+  step[neq] = step[neq-1];
+  
+  for (int i = 0; i < neq; i++) {	// Insert new locations for the zeros.
+    xm = xx[i] - step[i];
+
+    // A move that would leave the approximation interval is skipped
+    if (xm <= apstrt)
+      continue;
+
+    if (xm >= apend)
+      continue;
+
+    // Keep the zero strictly between its neighbouring extrema
+    if (xm <= mm[i])
+      xm = (bigfloat)0.5 * (mm[i] + xx[i]);    
+
+    if (xm >= mm[i+1])
+      xm = (bigfloat)0.5 * (mm[i+1] + xx[i]);
+    
+    xx[i] = xm;
+  }
+}
+
+// Solve the equations
+// Builds the neq x neq system (row-major in A, right-hand side in B) with one
+// row per zero guess xx[i]: the columns hold x^j for every numerator power
+// that is present, then -y*x^j for every present denominator power below
+// pow_d, and the right-hand side is y*x^pow_d, where y = func(x). The system
+// is then solved by simq(). If simq() reports failure the process terminates
+// with a failure exit status.
+void AlgRemezGeneral::equations(){
+  bigfloat x, y, z;
+  bigfloat *aa;
+  
+  for (int i = 0; i < neq; i++) {	// set up the equations for solution by simq()
+    int ip = neq * i;		// offset to 1st element of this row of matrix
+    x = xx[i];			// the guess for this row
+    y = func(x);		// right-hand-side vector
+
+    z = (bigfloat)1l;
+    aa = A.data()+ip;
+    int t = 0;
+    for (int j = 0; j <= pow_n; j++) {
+      if(num_pows[j] != -1){ *aa++ = z; t++; }
+      z *= x;
+    }
+    assert(t == n+1);
+
+    z = (bigfloat)1l;
+    t = 0;
+    for (int j = 0; j < pow_d; j++) {
+      if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
+      z *= x;
+    }
+    assert(t == d);
+
+    B[i] = y * z;		// Right hand side vector
+  }
+
+  // Solve the simultaneous linear equations.
+  if (simq()){
+    std::cout<<"simq failed\n";
+    exit(EXIT_FAILURE);	// a failed solve must not report success to the caller
+  }
+}
+
+
+// Evaluate the rational approximation P(x)/Q(x) at x by Horner's rule, taking
+// the coefficients from the solution vector param.
+// param[0..n] are the retained numerator coefficients and param[n+1..n+d] the
+// retained denominator coefficients, both in ascending power order; powers
+// flagged with -1 in num_pows / den_pows contribute a zero coefficient.
+bigfloat AlgRemezGeneral::approx(const bigfloat x) const{
+  // Numerator: start from the highest-power coefficient and work down to the constant term.
+  int idx = n;
+  bigfloat numer = param[idx--];
+  for (int p = pow_n-1; p >= 0; p--) {
+    bigfloat coeff(0l);
+    if (num_pows[p] != -1) coeff = param[idx--];
+    numer = x * numer  +  coeff;
+  }
+
+  // Denominator: the coefficient of x^pow_d is fixed to 1.0 and is not stored in param.
+  idx = n+d;
+  bigfloat denom = 1l;
+  for (int p = pow_d-1; p >= 0; p--) {
+    bigfloat coeff(0l);
+    if (den_pows[p] != -1) coeff = param[idx--];
+    denom = x * denom  +  coeff;
+  }
+
+  return(numer/denom);
+}
+
+// Compute size and sign of the approximation error at x
+bigfloat AlgRemezGeneral::getErr(bigfloat x, int *sign) const{
+  bigfloat f = func(x);
+  bigfloat e = approx(x) - f;
+  if (f != 0) e /= f;
+  if (e < (bigfloat)0.0) {
+    *sign = -1;
+    e = -e;
+  }
+  else *sign = 1;
+  
+  return(e);
+}
+
+// Solve the system AX=B, writing the solution X into param.
+//
+// A is the neq x neq matrix stored row-major in the member A and B the
+// right-hand side; both are set up by equations(). The elimination is carried
+// out in place on A with row pivoting recorded in the permutation vector IPS
+// (rows are never physically swapped). Pivots are chosen by comparing
+// |A[row][k]| scaled by the reciprocal of the largest magnitude in that row.
+//
+// Returns 0 on success, 1 if a row of A is entirely zero, 2 if no non-zero
+// pivot can be found for some column, 3 if the final pivot is zero.
+int AlgRemezGeneral::simq(){
+
+  int ip, ipj, ipk, ipn;
+  int idxpiv;
+  int kp, kp1, kpk, kpn;
+  int nip, nkp;
+  bigfloat em, q, rownrm, big, size, pivot, sum;
+  bigfloat *aa;
+  bigfloat *X = param.data();
+
+  // NOTE: this local n (the system size) shadows the member n (number of numerator terms)
+  int n = neq;
+  int nm1 = n - 1;
+  // Initialize IPS to the identity permutation and X[i] to the reciprocal of the
+  // largest magnitude in row i (X is reused as scratch for the row scale factors)
+  
+  int ij = 0;
+  for (int i = 0; i < n; i++) {
+    IPS[i] = i;
+    rownrm = 0.0;
+    for(int j = 0; j < n; j++) {
+      q = abs_bf(A[ij]);
+      if(rownrm < q) rownrm = q;
+      ++ij;
+    }
+    if (rownrm == (bigfloat)0l) {
+      std::cout<<"simq rownrm=0\n";
+      return(1);
+    }
+    X[i] = (bigfloat)1.0 / rownrm;
+  }
+  
+  // Forward elimination, one column k at a time
+  for (int k = 0; k < nm1; k++) {
+    big = 0.0;
+    idxpiv = 0;
+    
+    // Select as pivot the remaining row with the largest scaled entry in column k
+    for (int i = k; i < n; i++) {
+      ip = IPS[i];
+      ipk = n*ip + k;
+      size = abs_bf(A[ipk]) * X[ip];
+      if (size > big) {
+	big = size;
+	idxpiv = i;
+      }
+    }
+    
+    if (big == (bigfloat)0l) {
+      std::cout<<"simq big=0\n";
+      return(2);
+    }
+    // Record the row exchange in the permutation only
+    if (idxpiv != k) {
+      int j = IPS[k];
+      IPS[k] = IPS[idxpiv];
+      IPS[idxpiv] = j;
+    }
+    kp = IPS[k];
+    kpk = n*kp + k;
+    pivot = A[kpk];
+    kp1 = k+1;
+    // Eliminate column k from the remaining rows; the multiplier A[ipk]/pivot is
+    // stored in place of the eliminated entry for use in the forward substitution
+    for (int i = kp1; i < n; i++) {
+      ip = IPS[i];
+      ipk = n*ip + k;
+      em = -A[ipk] / pivot;
+      A[ipk] = -em;
+      nip = n*ip;
+      nkp = n*kp;
+      aa = A.data()+nkp+kp1;
+      for (int j = kp1; j < n; j++) {
+	ipj = nip + j;
+	A[ipj] = A[ipj] + em * *aa++;
+      }
+    }
+  }
+  kpn = n * IPS[n-1] + n - 1;	// last element of the IPS[n-1] th row
+  if (A[kpn] == (bigfloat)0l) {
+    std::cout<<"simq A[kpn]=0\n";
+    return(3);
+  }
+
+  
+  // Forward substitution using the stored multipliers (X now holds the solution workspace)
+  ip = IPS[0];
+  X[0] = B[ip];
+  for (int i = 1; i < n; i++) {
+    ip = IPS[i];
+    ipj = n * ip;
+    sum = 0.0;
+    for (int j = 0; j < i; j++) {
+      sum += A[ipj] * X[j];
+      ++ipj;
+    }
+    X[i] = B[ip] - sum;
+  }
+  
+  // Back substitution, starting from the last unknown
+  ipn = n * IPS[n-1] + n - 1;
+  X[n-1] = X[n-1] / A[ipn];
+  
+  for (int iback = 1; iback < n; iback++) {
+    //i goes (n-2),...,0
+    int i = nm1 - iback;
+    ip = IPS[i];
+    nip = n*ip;
+    sum = 0.0;
+    aa = A.data()+nip+i+1;
+    for (int j= i + 1; j < n; j++) 
+      sum += *aa++ * X[j];
+    X[i] = (X[i] - sum) / A[nip+i];
+  }
+  
+  return(0);
+}
+
+// Write a report of the current approximation to os:
+//   - the numerator and denominator as sums of coeff*x^i terms,
+//   - the error magnitude and sign at each of the neq+1 stored maxima mm[i],
+//   - a scan of 60 equally spaced points over [apstrt, apend] with lines
+//     "x,approx,func,approx-func".
+void AlgRemezGeneral::csv(std::ostream & os) const{
+  os << "Numerator" << std::endl;
+  for(int i=0;i<=pow_n;i++){
+    os << getCoeffNum(i) << "*x^" << i;
+    if(i!=pow_n) os << " + ";
+  }
+  os << std::endl;
+
+  os << "Denominator" << std::endl;
+  for(int i=0;i<=pow_d;i++){
+    os << getCoeffDen(i) << "*x^" << i;
+    if(i!=pow_d) os << " + ";
+  }
+  os << std::endl;
+
+  //For a true minimax solution the errors should all be equal and the signs should oscillate +-+-+- etc
+  int sign;
+  os << "Errors at maxima: coordinate, error, (sign)" << std::endl;
+  for(int i=0;i<neq+1;i++){
+    os << mm[i] << " " << getErr(mm[i],&sign) << " (" << sign << ")" << std::endl;
+  }
+
+  os << "Scan over range:" << std::endl;
+  const int npt = 60;
+  bigfloat dlt = (apend - apstrt)/bigfloat(npt-1);
+
+  // Step by integer index rather than accumulating x += dlt: accumulated rounding
+  // could push the last point past apend and drop it, and a zero-width range
+  // (dlt == 0) would never terminate a loop conditioned on x <= apend.
+  for (int i = 0; i < npt; i++) {
+    bigfloat x = apstrt + bigfloat(i) * dlt;
+    double f = evaluateFunc(x);
+    double r = evaluateApprox(x);
+    os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
+  }
+  return;
+}
@@ -0,0 +1,170 @@
+/*
+  C.Kelly Jan 2020 based on implementation by M. Clark May 2005
+
+  AlgRemezGeneral is an implementation of the Remez algorithm for approximating an arbitrary function by a rational polynomial 
+  It includes optional restriction to odd/even polynomials for the numerator and/or denominator
+*/
+
+#ifndef INCLUDED_ALG_REMEZ_GENERAL_H
+#define INCLUDED_ALG_REMEZ_GENERAL_H
+
+#include <stddef.h>
+#include <Grid/GridStd.h>
+
+#ifdef HAVE_LIBGMP
+#include "bigfloat.h"
+#else
+#include "bigfloat_double.h"
+#endif
+
+
+// Remez-algorithm driver producing a rational approximation P(x)/Q(x) to a
+// user-supplied function on the interval [apstrt, apend]. The numerator and
+// denominator may each be restricted to even or odd powers only. The
+// coefficient of the highest denominator power is fixed to one.
+class AlgRemezGeneral{
+ public:
+  // Which powers a polynomial is allowed to contain
+  enum PolyType { Even, Odd, Full };
+
+ private:
+
+  // In GSL-style, pass the function as a function pointer. Any data required to evaluate the function is passed in as a void pointer
+  bigfloat (*f)(bigfloat x, void *data);
+  void *data;
+
+  // The approximation parameters: the solution vector of the linear system,
+  // numerator coefficients first followed by the denominator coefficients
+  std::vector<bigfloat> param;
+  bigfloat norm; // NOTE(review): not used by any code visible in this header - confirm purpose
+
+  // The number of non-zero terms in the numerator and denominator
+  int n, d;
+  // The numerator and denominator degree (i.e.  the largest power)
+  int pow_n, pow_d;
+  
+  // Specify if the numerator and/or denominator are odd/even polynomials
+  PolyType num_type;
+  PolyType den_type;
+  // Indexed by power: position of that power's coefficient within the
+  // numerator / denominator part of param, or -1 if the power is not present
+  std::vector<int> num_pows; //contains the mapping, with -1 if not present
+  std::vector<int> den_pows;
+
+  // The bounds of the approximation (start, width = end - start, end)
+  bigfloat apstrt, apwidt, apend;
+
+  // Variables used to calculate the approximation
+  int nd1, iter;
+  std::vector<bigfloat> xx;   // current guesses at which the equations are set up
+  std::vector<bigfloat> mm;   // positions of the error maxima found by the search
+  std::vector<bigfloat> step; // step sizes used when moving the guesses
+
+  bigfloat delta, spread;
+  
+  // Variables used in search: values of the error maxima
+  std::vector<bigfloat> yy;
+
+  // Variables used in solving linear equations
+  std::vector<bigfloat> A;  // matrix, stored row-major
+  std::vector<bigfloat> B;  // right-hand side
+  std::vector<int> IPS;     // row permutation used for pivoting
+
+  // The number of equations we must solve at each iteration (n+d+1)
+  int neq;
+
+  // The precision of the GNU MP library
+  long prec;
+
+  // Initialize member variables associated with the polynomial's properties
+  void setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in);
+
+  // Initial values of maximal and minimal errors
+  void initialGuess();
+
+  // Initialise step sizes
+  void stpini();
+
+  // Initialize the algorithm
+  void reinitializeAlgorithm();
+
+  // Set up and solve the linear equations for the coefficients
+  void equations();
+
+  // Search for error maxima and minima
+  void search(); 
+
+  // Calculate function required for the approximation by calling the
+  // user-supplied function pointer with the stored data pointer
+  inline bigfloat func(bigfloat x) const{
+    return f(x, data);
+  }
+
+  // Compute size and sign of the approximation error at x
+  bigfloat getErr(bigfloat x, int *sign) const;
+
+  // Solve the system AX=B   where X = param; returns non-zero on failure
+  int simq();
+
+  // Evaluate the rational form P(x)/Q(x) using coefficients from the solution vector param
+  bigfloat approx(bigfloat x) const;
+
+ public:
+  
+  // lower, upper : bounds of the approximation interval
+  // prec         : precision passed on to the big-float arithmetic
+  // f, data      : function to approximate and the opaque pointer handed to it on every call
+  AlgRemezGeneral(double lower, double upper, long prec,
+		  bigfloat (*f)(bigfloat x, void *data), void *data);
+
+  // Number of terms, only meaningful when numerator and denominator have the same
+  // number of terms (asserted)
+  inline int getDegree(void) const{ 
+    assert(n==d);
+    return n;
+  }
+  // Reset the bounds of the approximation
+  inline void setBounds(double lower, double upper) {
+    apstrt = lower;
+    apend = upper;
+    apwidt = apend - apstrt;
+  }
+
+  // Get the bounds of the approximation
+  inline void getBounds(double &lower, double &upper) const{ 
+    lower=(double)apstrt;
+    upper=(double)apend;
+  }
+
+  // Run the algorithm to generate the rational approximation
+  // num_degree, den_degree : largest power of the numerator / denominator
+  // num_type, den_type     : restriction of each polynomial to even/odd/all powers
+  // NOTE(review): the meaning of the return value and of report_freq is defined in the
+  // implementation file, which is not visible from this header - confirm there
+  double generateApprox(int num_degree, int den_degree, 
+			PolyType num_type, PolyType den_type,
+			const double tolerance = 1e-15, const int report_freq = 1000);
+  
+  // Convenience overload: both polynomials contain all powers
+  inline double generateApprox(int num_degree, int den_degree, 
+			       const double tolerance = 1e-15, const int report_freq = 1000){
+    return generateApprox(num_degree, den_degree, Full, Full, tolerance, report_freq);
+  }
+  
+  // Evaluate the rational form P(x)/Q(x) using coefficients from the
+  // solution vector param
+  inline double evaluateApprox(double x) const{
+    return (double)approx((bigfloat)x);
+  }
+
+  // Evaluate the rational form Q(x)/P(x) using coefficients from the solution vector param
+  // (computed as the reciprocal of the double-precision value of P(x)/Q(x))
+  inline double evaluateInverseApprox(double x) const{
+    return 1.0/(double)approx((bigfloat)x);
+  }  
+
+  // Calculate function required for the approximation
+  inline double evaluateFunc(double x) const{
+    return (double)func((bigfloat)x);
+  }
+
+  // Calculate inverse function required for the approximation
+  inline double evaluateInverseFunc(double x) const{
+    return 1.0/(double)func((bigfloat)x);
+  }
+
+  // Dump csv of function, approx and error
+  void csv(std::ostream &os = std::cout) const;
+
+  // Get the coefficient of the term x^i in the numerator (zero if the power is not present)
+  // i is used directly as an index into num_pows and is not range checked
+  inline double getCoeffNum(const int i) const{    
+    return num_pows[i] == -1 ? 0. : double(param[num_pows[i]]);
+  }
+  // Get the coefficient of the term x^i in the denominator
+  // The highest power has unit coefficient by construction; the other denominator
+  // coefficients are stored in param after the n+1 numerator coefficients
+  inline double getCoeffDen(const int i) const{ 
+    if(i == pow_d) return 1.0;
+    else return den_pows[i] == -1 ? 0. : double(param[den_pows[i]+n+1]); 
+  }
+};
+
+#endif
@@ -0,0 +1,183 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/ZMobius.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/algorithms/approx/ZMobius.h>
+#include <Grid/algorithms/approx/RemezGeneral.h>
+
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Approx);
+
+//Compute the tanh approximation  [ f(x) - f(-x) ] / [ f(x) + f(-x) ]  with  f(x) = \Prod_i ( w_i + x )
+//for complex w_i; only the real part of the ratio is returned
+inline double epsilonMobius(const double x, const std::vector<ComplexD> &w){
+  ComplexD prod_plus = 1., prod_minus = 1.;
+  for(const ComplexD &w_s : w){
+    prod_plus  = prod_plus  * ( w_s + x );
+    prod_minus = prod_minus * ( w_s - x );
+  }
+  const ComplexD ratio = (prod_plus - prod_minus)/(prod_plus + prod_minus);
+  return ratio.real();
+}
+//Real-parameter version:  [ f(x) - f(-x) ] / [ f(x) + f(-x) ]  with  f(x) = \Prod_i ( w_i + x )
+inline double epsilonMobius(const double x, const std::vector<RealD> &w){
+  double prod_plus = 1., prod_minus = 1.;
+  for(const RealD &w_s : w){
+    prod_plus  = prod_plus  * ( w_s + x );
+    prod_minus = prod_minus * ( w_s - x );
+  }
+  const double numer = prod_plus - prod_minus;
+  const double denom = prod_plus + prod_minus;
+  return numer/denom;
+}
+
+
+
+//Compute the tanh approximation in a form suitable for the Remez:
+//the signature matches the function-pointer type taken by AlgRemezGeneral, and
+//data must point to a std::vector<RealD> holding the omega parameters
+bigfloat epsilonMobius(bigfloat x, void* data){
+  const std::vector<RealD> *omega_ptr = (std::vector<RealD> const*)data;
+  bigfloat prod_plus(1.0);
+  bigfloat prod_minus(1.0);
+
+  for(const RealD &omega_s : *omega_ptr){
+    prod_plus  = prod_plus  * ( bigfloat(omega_s) + x);
+    prod_minus = prod_minus * ( bigfloat(omega_s) - x);
+  }
+  return (prod_plus - prod_minus)/(prod_plus + prod_minus);
+}
+
+//Compute the Zmobius Omega parameters suitable for eigenvalue range   -lambda_bound <= lambda <= lambda_bound
+//Note omega_i = 1/(b_i + c_i)   where b_i and c_i are the Mobius parameters
+//
+// omega_out    : resized to Ls_out and filled with the resulting (complex) parameters
+// Ls_out       : number of output parameters; must be positive and even (asserted)
+// omega_in     : real input parameters defining the function to be approximated
+// Ls_in        : must equal omega_in.size() (asserted)
+// lambda_bound : the Remez fit is performed on [0, lambda_bound]
+//
+// The Remez report, the resulting omega_i and a comparison table are written to std::cout.
+void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
+			 const std::vector<RealD> &omega_in, const int Ls_in,
+			 const RealD lambda_bound){
+  assert((int)omega_in.size() == Ls_in);
+
+  //The polynomial built below takes its even powers from the (even) denominator and its odd powers from the
+  //(odd) numerator of degree Ls_out-1. Only for even Ls_out is it monic of degree Ls_out, as the companion
+  //matrix requires, and only then do the coefficient lookups stay within the numerator's degree.
+  assert(Ls_out > 0 && Ls_out % 2 == 0);
+
+  omega_out.resize(Ls_out);
+
+  //Use the Remez algorithm to generate the appropriate rational polynomial
+  //For odd polynomial, to satisfy Haar condition must take either positive or negative half of range (cf https://arxiv.org/pdf/0803.0439.pdf page 6)  
+  AlgRemezGeneral remez(0, lambda_bound, 64, &epsilonMobius, (void*)&omega_in); 
+  remez.generateApprox(Ls_out-1, Ls_out,AlgRemezGeneral::Odd, AlgRemezGeneral::Even, 1e-15, 100);
+  remez.csv(std::cout);
+
+  //The rational approximation has the form  [ f(x) - f(-x) ] / [ f(x) + f(-x) ]  where  f(x) = \Prod_{i=0}^{L_s-1} ( \omega_i + x )
+  //cf https://academiccommons.columbia.edu/doi/10.7916/D8T72HD7  pg 102
+  //omega_i are therefore the negative of the complex roots of f(x)
+
+  //We can find the roots by recognizing that the eigenvalues of a matrix A are the roots of the characteristic polynomial
+  // \rho(\lambda) = det( A - \lambda I )    where I is the unit matrix
+  //The matrix whose characteristic polynomial is an arbitrary monic polynomial a0 + a1 x + a2 x^2 + ... + x^n   is the companion matrix 
+  // A = | 0    1   0    0 0 .... 0      |
+  //     | 0    0   1    0 0 .... 0      |
+  //     | :    :   :    : :      :      |
+  //     | 0    0   0    0 0      1      |
+  //     | -a0 -a1 -a2  ...  ... -a(n-1) |
+
+
+  //Note the Remez defines the largest power to have unit coefficient
+  //coeffs[Ls_out] is that unit coefficient; it is implied by, and not entered into, the companion matrix
+  std::vector<RealD> coeffs(Ls_out+1);
+  for(int i=0;i<Ls_out+1;i+=2) coeffs[i] = remez.getCoeffDen(i); //even powers
+  for(int i=1;i<Ls_out+1;i+=2) coeffs[i] = remez.getCoeffNum(i); //odd powers
+
+  //Form the companion matrix: ones on the super-diagonal, negated coefficients in the last row
+  Eigen::MatrixXd compn(Ls_out,Ls_out);
+  for(int j=0;j<Ls_out;j++){
+    for(int i=0;i<Ls_out-1;i++) compn(i,j) = (j == i+1) ? 1. : 0.;
+    compn(Ls_out - 1, j) = -coeffs[j];
+  }
+
+  //Eigensolve (eigenvalues only)
+  Eigen::EigenSolver<Eigen::MatrixXd> slv(compn, false);
+
+  const auto & ev = slv.eigenvalues();
+  for(int i=0;i<Ls_out;i++)
+    omega_out[i] = -ev(i);
+
+  //Sort ascending (smallest at start of vector!) by real part, ties broken by imaginary part
+  std::sort(omega_out.begin(), omega_out.end(), 
+	    [](const ComplexD &a, const ComplexD &b){ return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag()); });
+
+  //McGlynn thesis pg 122 suggest improved iteration counts if magnitude of omega diminishes towards the center of the 5th dimension
+  //Walk the sorted list from the largest entry down, placing entries alternately at the low and high ends
+  std::vector<ComplexD> omega_tmp = omega_out;
+  int s_low=0, s_high=Ls_out-1, ss=0;
+  for(int s_from = Ls_out-1; s_from >= 0; s_from--){ //loop from largest omega
+    int s_to;
+    if(ss % 2 == 0){
+      s_to = s_low++;
+    }else{
+      s_to = s_high--;
+    }
+    omega_out[s_to] = omega_tmp[s_from];
+    ++ss;
+  }
+  
+  std::cout << "Resulting omega_i:" << std::endl;  
+  for(int i=0;i<Ls_out;i++)
+    std::cout << omega_out[i] << std::endl;
+
+  std::cout << "Test result matches the approximate polynomial found by the Remez" << std::endl;
+  std::cout << "<x> <remez approx> <poly approx> <diff poly approx remez approx> <exact> <diff poly approx exact>\n";
+  
+  int npt = 60;
+  double dlt = lambda_bound/double(npt-1);
+
+  for (int i =0; i<npt; i++){
+    double x = i*dlt;
+    double r = remez.evaluateApprox(x);       //rational function found by the Remez
+    double p = epsilonMobius(x, omega_out);   //same form rebuilt from the output omega
+    double e = epsilonMobius(x, omega_in);    //function being approximated
+
+    std::cout << x<< " " << r << " " << p <<" " <<r-p << " " << e << " " << e-p << std::endl;
+  }
+
+}
+  
+//mobius_param = b+c   with b-c=1
+//Convenience overload: every one of the Ls_in input omega is set to 1/mobius_param
+void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
+  const RealD omega_uniform = 1./mobius_param;
+  const std::vector<RealD> omega_in(Ls_in, omega_uniform);
+  computeZmobiusOmega(omega_out, Ls_out, omega_in, Ls_in, lambda_bound);
+}
+
+//ZMobius class takes  gamma_i = (b+c) omega_i as its input, where b, c are factored out
+//The omega are computed from mobius_param_in and then each is scaled by mobius_param_out
+void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, 
+			 const RealD mobius_param_out, const int Ls_out, 
+			 const RealD mobius_param_in, const int Ls_in,
+			 const RealD lambda_bound){
+  computeZmobiusOmega(gamma_out, Ls_out, mobius_param_in, Ls_in, lambda_bound);
+
+  int s = 0;
+  while(s < Ls_out){
+    gamma_out[s] = gamma_out[s] * mobius_param_out;
+    ++s;
+  }
+}
+//Assumes mobius_param_out == mobius_param_in
+//Forwards to the general version with the same Mobius parameter used for input and output
+void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
+  const RealD mobius_param_out = mobius_param;
+  const RealD mobius_param_in  = mobius_param;
+  computeZmobiusGamma(gamma_out, mobius_param_out, Ls_out, mobius_param_in, Ls_in, lambda_bound);
+}
+
+NAMESPACE_END(Approx);
+NAMESPACE_END(Grid);
@@ -0,0 +1,57 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/approx/ZMobius.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_ZMOBIUS_APPROX_H
+#define GRID_ZMOBIUS_APPROX_H
+
+#include <Grid/GridCore.h>
+
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Approx);
+
+//Compute the Zmobius Omega parameters suitable for eigenvalue range   -lambda_bound <= lambda <= lambda_bound
+//Note omega_i = 1/(b_i + c_i)   where b_i and c_i are the Mobius parameters
+//omega_out is resized to Ls_out and overwritten; omega_in must contain exactly Ls_in entries
+//Diagnostic output is written to std::cout
+void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
+			 const std::vector<RealD> &omega_in, const int Ls_in,
+			 const RealD lambda_bound);
+  
+//mobius_param = b+c   with b-c=1
+//As above, with all Ls_in input omega equal to 1/mobius_param
+void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
+
+//ZMobius class takes  gamma_i = (b+c) omega_i as its input, where b, c are factored out
+//The omega are computed using mobius_param_in and the result is multiplied by mobius_param_out
+void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, 
+			 const RealD mobius_param_out, const int Ls_out, 
+			 const RealD mobius_param_in, const int Ls_in,
+			 const RealD lambda_bound);
+
+//Assumes mobius_param_out == mobius_param_in
+void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
+
+NAMESPACE_END(Approx);
+NAMESPACE_END(Grid);
+
+#endif
@@ -25,6 +25,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
+#ifndef INCLUDED_BIGFLOAT_DOUBLE_H
+#define INCLUDED_BIGFLOAT_DOUBLE_H
+
 #include <math.h>

 typedef double mfloat; 
@@ -186,4 +190,6 @@ public:
  //  friend bigfloat& random(void);
 };

+#endif
+

@@ -0,0 +1,222 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/iterative/BiCGSTAB.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: juettner <juettner@soton.ac.uk>
+Author: David Murphy <djmurphy@mit.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef GRID_BICGSTAB_H
+#define GRID_BICGSTAB_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////
+// Base classes for iterative processes based on operators
+// single input vec, single output vec.
+/////////////////////////////////////////////////////////////
+
+// Stabilised bi-conjugate gradient solver for Linop.Op(psi) = src.
+//
+// On entry psi holds the initial guess, on exit the approximate solution.
+// The iteration stops once norm2(r) <= Tolerance^2 * norm2(src), where r is the
+// recursively updated residual. Only the real parts of the inner products are
+// used for the scalars rho, alpha and omega.
+// IterationsToComplete is set by every call: 0 if the initial guess already
+// satisfies the stopping condition, the iteration count on convergence, and
+// MaxIterations+1 if the iteration limit was reached without converging.
+template <class Field>
+class BiCGSTAB : public OperatorFunction<Field> 
+{
+  public:
+    using OperatorFunction<Field>::operator();
+    
+    bool ErrorOnNoConverge;  // throw an assert when the solver fails to converge.
+                             // Defaults true.
+    RealD Tolerance;
+    Integer MaxIterations;
+    Integer IterationsToComplete; //Number of iterations the solver took to finish. Filled in upon completion
+  
+    // Members are initialised in declaration order; IterationsToComplete starts at zero so
+    // that it never holds an indeterminate value
+    BiCGSTAB(RealD tol, Integer maxit, bool err_on_no_conv = true) : 
+      ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit), IterationsToComplete(0) {};
+
+    void operator()(LinearOperatorBase<Field>& Linop, const Field& src, Field& psi) 
+    {
+      psi.Checkerboard() = src.Checkerboard();
+      conformable(psi, src);
+
+      RealD cp(0), rho(1), rho_prev(0), alpha(1), beta(0), omega(1);
+      RealD a(0), bo(0), b(0), ssq(0);
+
+      // Work vectors, constructed from src so that they share its layout
+      Field p(src);
+      Field r(src);
+      Field rhat(src);
+      Field v(src);
+      Field s(src);
+      Field t(src);
+      Field h(src);
+
+      v = Zero();
+      p = Zero();
+
+      // Initial residual computation & set up
+      RealD guess = norm2(psi);
+      assert(std::isnan(guess) == 0);
+    
+      Linop.Op(psi, v);
+      b = norm2(v);
+
+      r = src - v;
+      rhat = r;   // fixed shadow residual
+      a = norm2(r);
+      ssq = norm2(src);
+
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: guess " << guess << std::endl;
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:   src " << ssq << std::endl;
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:    mp " << b << std::endl;
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB:     r " << a << std::endl;
+
+      RealD rsq = Tolerance * Tolerance * ssq;
+
+      // Check if guess is really REALLY good :)
+      // No iterations are needed; record that so callers accumulating iteration counts
+      // do not read a stale or uninitialised value.
+      if(a <= rsq){
+        IterationsToComplete = 0;
+        return;
+      }
+
+      std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: k=0 residual " << a << " target " << rsq << std::endl;
+
+      GridStopWatch LinalgTimer;
+      GridStopWatch InnerTimer;
+      GridStopWatch AxpyNormTimer;   // never started; reported for a uniform time breakdown
+      GridStopWatch LinearCombTimer;
+      GridStopWatch MatrixTimer;
+      GridStopWatch SolverTimer;
+
+      SolverTimer.Start();
+      int k;
+      for (k = 1; k <= MaxIterations; k++) 
+      {
+        rho_prev = rho;
+
+        // rho = Re <rhat, r>
+        LinalgTimer.Start();
+        InnerTimer.Start();
+        ComplexD Crho  = innerProduct(rhat,r);
+        InnerTimer.Stop();
+        rho = Crho.real();
+
+        beta = (rho / rho_prev) * (alpha / omega);
+
+        // p = r + beta * (p - omega * v)
+        LinearCombTimer.Start();
+        bo = beta * omega;
+        auto p_v = p.View();
+        auto r_v = r.View();
+        auto v_v = v.View();
+        accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
+          coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
+        });
+        LinearCombTimer.Stop();
+        LinalgTimer.Stop();
+
+        // v = A p
+        MatrixTimer.Start();
+        Linop.Op(p,v);
+        MatrixTimer.Stop();
+
+        // alpha = rho / Re <rhat, v>
+        LinalgTimer.Start();
+        InnerTimer.Start();
+        ComplexD Calpha = innerProduct(rhat,v);
+        InnerTimer.Stop();
+        alpha = rho / Calpha.real();
+
+        // h = psi + alpha * p
+        LinearCombTimer.Start();
+        auto h_v = h.View();
+        auto psi_v = psi.View();
+        accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
+          coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
+        });
+        
+        // s = r - alpha * v
+        auto s_v = s.View();
+        accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
+          coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
+        });
+        LinearCombTimer.Stop();
+        LinalgTimer.Stop();
+
+        // t = A s
+        MatrixTimer.Start();
+        Linop.Op(s,t);
+        MatrixTimer.Stop();
+
+        // omega = Re <t, s> / <t, t>
+        LinalgTimer.Start();
+        InnerTimer.Start();
+        ComplexD Comega = innerProduct(t,s);
+        InnerTimer.Stop();
+        omega = Comega.real() / norm2(t);
+
+        // psi = h + omega * s ,  r = s - omega * t
+        LinearCombTimer.Start();
+        auto t_v = t.View();
+        accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
+          coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
+          coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
+        });
+        LinearCombTimer.Stop();
+
+        cp = norm2(r);
+        LinalgTimer.Stop();
+
+        std::cout << GridLogIterative << "BiCGSTAB: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
+
+        // Stopping condition
+        if(cp <= rsq) 
+        {
+          SolverTimer.Stop();
+          // Recompute the residual from scratch to report the true value
+          Linop.Op(psi, v);
+          p = v - src;
+
+          RealD srcnorm = sqrt(norm2(src));
+          RealD resnorm = sqrt(norm2(p));
+          RealD true_residual = resnorm / srcnorm;
+
+          std::cout << GridLogMessage << "BiCGSTAB Converged on iteration " << k << std::endl;
+          std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp/ssq) << std::endl;
+          std::cout << GridLogMessage << "\tTrue residual " << true_residual << std::endl;
+          std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+
+          std::cout << GridLogMessage << "Time breakdown " << std::endl;
+          std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() << std::endl;
+          std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
+
+          // The recursive residual met the target; guard against it having drifted
+          // far from the true residual
+          if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }
+
+          IterationsToComplete = k;	
+
+          return;
+        }
+      }
+      
+      std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
+
+      if(ErrorOnNoConverge){ assert(0); }
+      IterationsToComplete = k;   // k == MaxIterations+1 here
+    }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
@@ -0,0 +1,158 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: ./lib/algorithms/iterative/BiCGSTABMixedPrec.h
+
+Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+Author: David Murphy <djmurphy@mit.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef GRID_BICGSTAB_MIXED_PREC_H
+#define GRID_BICGSTAB_MIXED_PREC_H
+
+NAMESPACE_BEGIN(Grid);
+
+// Mixed precision restarted defect correction BiCGSTAB
+//
+// Each outer iteration computes the residual of the current double-precision
+// solution with the double-precision operator, converts it to single precision,
+// solves for a correction with a single-precision BiCGSTAB and adds the
+// correction to the double-precision solution. Once the residual is within
+// OuterLoopNormMult of the target (or the outer iteration limit is reached) a
+// final double-precision BiCGSTAB is run starting from the accumulated solution.
+// The two enable_if template parameters restrict FieldD / FieldF to types for which
+// getPrecision reports 2 / 1 respectively.
+template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
+{
+  public:                                                
+    RealD   Tolerance;
+    RealD   InnerTolerance; // Initial tolerance for inner BiCGSTAB. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid; // Grid for single-precision fields
+    RealD OuterLoopNormMult; // Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+    LinearOperatorBase<FieldF> &Linop_f;
+    LinearOperatorBase<FieldD> &Linop_d;
+
+    Integer TotalInnerIterations; //Number of inner BiCGSTAB iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of BiCGSTAB iterations in final patch-up step
+
+    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+    LinearFunction<FieldF> *guesser;
+    
+    // OuterLoopNormMult defaults to 100 and no guesser is installed
+    MixedPrecisionBiCGSTAB(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, 
+        LinearOperatorBase<FieldF>& _Linop_f, LinearOperatorBase<FieldD>& _Linop_d) : 
+      Linop_f(_Linop_f), Linop_d(_Linop_d), Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), OuterLoopNormMult(100.), guesser(NULL) {};
+
+    // Install a guesser; only its address is stored, so g must outlive this object's use of it
+    void useGuesser(LinearFunction<FieldF>& g){
+      guesser = &g;
+    }
+  
+    // Solve Linop_d.Op(sol_d) = src_d_in. sol_d is used as the initial guess and
+    // is overwritten with the solution.
+    void operator() (const FieldD& src_d_in, FieldD& sol_d)
+    {
+      TotalInnerIterations = 0;
+    
+      GridStopWatch TotalTimer;
+      TotalTimer.Start();
+      
+      int cb = src_d_in.Checkerboard();
+      sol_d.Checkerboard() = cb;
+      
+      // Target for the squared residual norm
+      RealD src_norm = norm2(src_d_in);
+      RealD stop = src_norm * Tolerance*Tolerance;
+
+      GridBase* DoublePrecGrid = src_d_in.Grid();
+      FieldD tmp_d(DoublePrecGrid);
+      tmp_d.Checkerboard() = cb;
+      
+      FieldD tmp2_d(DoublePrecGrid);
+      tmp2_d.Checkerboard() = cb;
+      
+      FieldD src_d(DoublePrecGrid);
+      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
+      
+      RealD inner_tol = InnerTolerance;
+      
+      FieldF src_f(SinglePrecGrid);
+      src_f.Checkerboard() = cb;
+      
+      FieldF sol_f(SinglePrecGrid);
+      sol_f.Checkerboard() = cb;
+      
+      // Inner single-precision solver; a failure to converge is tolerated because the
+      // outer loop restarts from the updated residual
+      BiCGSTAB<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      CG_f.ErrorOnNoConverge = false;
+
+      GridStopWatch InnerCGtimer;
+
+      GridStopWatch PrecChangeTimer;
+      
+      Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+        
+      for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++)
+      {
+        // Compute double precision rsd and also new RHS vector.
+        Linop_d.Op(sol_d, tmp_d);
+        // presumably src_d = src_d_in - tmp_d with norm the squared norm of the result - confirm against axpy_norm
+        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
+        
+        std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration " << outer_iter << " residual " << norm << " target " << stop << std::endl;
+
+        if(norm < OuterLoopNormMult * stop){
+          std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration converged on iteration " << outer_iter << std::endl;
+          break;
+        }
+        // Loosen the inner tolerance (by doubling) so the inner solve is not asked to go
+        // below the overall target; inner_tol is never tightened again
+        while(norm * inner_tol * inner_tol < stop){ inner_tol *= 2; } // inner_tol = sqrt(stop/norm) ??
+
+        PrecChangeTimer.Start();
+        precisionChange(src_f, src_d);
+        PrecChangeTimer.Stop();
+        
+        sol_f = Zero();
+
+        //Optionally improve inner solver guess (eg using known eigenvectors)
+        if(guesser != NULL){ (*guesser)(src_f, sol_f); }
+
+        //Inner BiCGSTAB
+        CG_f.Tolerance = inner_tol;
+        InnerCGtimer.Start();
+        CG_f(Linop_f, src_f, sol_f);
+        InnerCGtimer.Stop();
+        TotalInnerIterations += CG_f.IterationsToComplete;
+        
+        //Convert sol back to double and add to double prec solution
+        PrecChangeTimer.Start();
+        precisionChange(tmp_d, sol_f);
+        PrecChangeTimer.Stop();
+        
+        axpy(sol_d, 1.0, tmp_d, sol_d);
+      }
+      
+      //Final trial BiCGSTAB in double precision, starting from the accumulated solution
+      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Starting final patch-up double-precision solve" << std::endl;
+      
+      BiCGSTAB<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      CG_d(Linop_d, src_d_in, sol_d);
+      TotalFinalStepIterations = CG_d.IterationsToComplete;
+
+      TotalTimer.Stop();
+      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
+      std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
+
+#endif
@@ -52,6 +52,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  Integer PrintInterval; //GridLogMessages or Iterative
+  RealD TrueResidual;
  
  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
@@ -306,7 +307,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)

      Linop.HermOp(X, AD);
      AD = AD-B;
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
+      TrueResidual = std::sqrt(norm2(AD)/norm2(B));
+      std::cout << GridLogMessage <<"\tTrue residual is " << TrueResidual <<std::endl;

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -442,7 +444,8 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &

      Linop.HermOp(Psi, AP);
      AP = AP-Src;
-      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+      TrueResidual = std::sqrt(norm2(AP)/norm2(Src));
+      std::cout <<GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -653,7 +656,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
      if ( rr > max_resid ) max_resid = rr;
    }

-    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
+    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;

    if ( max_resid < Tolerance*Tolerance ) { 

@@ -668,7 +671,8 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field

      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
-      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
+      TrueResidual = std::sqrt(normv(AD)/normv(B));
+      std::cout << GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -49,6 +49,7 @@ public:
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  RealD TrueResidual;
  
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Tolerance(tol),
@@ -81,6 +82,14 @@ public:
    cp = a;
    ssq = norm2(src);

+    // Handle trivial case of zero src
+    if (ssq == 0.){
+      psi = Zero();
+      IterationsToComplete = 1;
+      TrueResidual = 0.;
+      return;
+    }
+
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl;
@@ -92,6 +101,7 @@ public:

    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
+      TrueResidual = std::sqrt(a/ssq);
      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
      IterationsToComplete = 0;	
      return;
@@ -141,7 +151,7 @@ public:
      LinalgTimer.Stop();

      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
-                << " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
+                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;

      // Stopping condition
      if (cp <= rsq) {
@@ -169,10 +179,17 @@ public:
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

 	IterationsToComplete = k;	
+	TrueResidual = true_residual;

        return;
      }
    }
+    // Failed. Calculate true residual before giving up                                                         
+    Linop.HermOpAndNorm(psi, mmp, d, qq);
+    p = mmp - src;
+
+    TrueResidual = sqrt(norm2(p)/ssq);
+
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;

    if (ErrorOnNoConverge) assert(0);
@@ -46,15 +46,19 @@ public:

  RealD   Tolerance;
  Integer MaxIterations;
-    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
  int verbose;
  MultiShiftFunction shifts;
+  std::vector<RealD> TrueResidualShift;

  ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
    MaxIterations(maxit),
    shifts(_shifts)
  { 
    verbose=1;
+    IterationsToCompleteShift.resize(_shifts.order); // per-shift iteration counts, sized to _shifts.order; indexed by shift s in operator()
+    TrueResidualShift.resize(_shifts.order);         // per-shift true residuals, same sizing and indexing
  }

  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
@@ -125,6 +129,17 @@ public:
    // Residuals "r" are src
    // First search direction "p" is also src
    cp = norm2(src);
+
+    // Handle trivial case of zero src.
+    if( cp == 0. ){
+      for(int s=0;s<nshift;s++){
+	psi[s] = Zero();
+	IterationsToCompleteShift[s] = 1;
+	TrueResidualShift[s] = 0.;
+      }
+      return;
+    }
+
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
@@ -270,6 +285,7 @@ public:
      for(int s=0;s<nshift;s++){
      
 	if ( (!converged[s]) ){
+	  IterationsToCompleteShift[s] = k;
 	
 	  RealD css  = c * z[s][iz]* z[s][iz];
 	
@@ -299,7 +315,8 @@ public:
 	  axpy(r,-alpha[s],src,tmp);
 	  RealD rn = norm2(r);
 	  RealD cn = norm2(src);
-	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+	  TrueResidualShift[s] = std::sqrt(rn/cn);
+	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<< TrueResidualShift[s] <<std::endl;
 	}

      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
@@ -37,211 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>

 NAMESPACE_BEGIN(Grid); 

-  ////////////////////////////////////////////////////////
-  // Move following 100 LOC to lattice/Lattice_basis.h
-  ////////////////////////////////////////////////////////
-template<class Field>
-void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
-{
-  // If assume basis[j] are already orthonormal,
-  // can take all inner products in parallel saving 2x bandwidth
-  // Save 3x bandwidth on the second line of loop.
-  // perhaps 2.5x speed up.
-  // 2x overall in Multigrid Lanczos  
-  for(int j=0; j<k; ++j){
-    auto ip = innerProduct(basis[j],w);
-    w = w - ip*basis[j];
-  }
-}
-
-template<class Field>
-void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
-{
-  typedef decltype(basis[0].View()) View;
-  auto tmp_v = basis[0].View();
-  Vector<View> basis_v(basis.size(),tmp_v);
-  typedef typename Field::vector_object vobj;
-  GridBase* grid = basis[0].Grid();
-
-  for(int k=0;k<basis.size();k++){
-    basis_v[k] = basis[k].View();
-  }
-#if 0
-  std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
-  thread_region
-  {
-    vobj* B = Bt.data() + Nm * thread_num();
-
-    thread_for_in_region(ss, grid->oSites(),{
-      for(int j=j0; j<j1; ++j) B[j]=0.;
-      
-      for(int j=j0; j<j1; ++j){
-	for(int k=k0; k<k1; ++k){
-	  B[j] +=Qt(j,k) * basis_v[k][ss];
-	}
-      }
-      for(int j=j0; j<j1; ++j){
-	basis_v[j][ss] = B[j];
-      }
-    });
-  }
-#else
-
-  int nrot = j1-j0;
-
-
-  uint64_t oSites   =grid->oSites();
-  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
-
-  //  printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
-
-  Vector <vobj> Bt(siteBlock * nrot); 
-  auto Bp=&Bt[0];
-
-  // GPU readable copy of Eigen matrix
-  Vector<double> Qt_jv(Nm*Nm);
-  double *Qt_p = & Qt_jv[0];
-  for(int k=0;k<Nm;++k){
-    for(int j=0;j<Nm;++j){
-      Qt_p[j*Nm+k]=Qt(j,k);
-    }
-  }
-
-  // Block the loop to keep storage footprint down
-  vobj zz=Zero();
-  for(uint64_t s=0;s<oSites;s+=siteBlock){
-
-    // remaining work in this block
-    int ssites=MIN(siteBlock,oSites-s);
-
-    // zero out the accumulators
-    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
-	auto z=coalescedRead(zz);
-	coalescedWrite(Bp[ss],z);
-    });
-
-    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
-	
-      int j =sj%nrot;
-      int jj  =j0+j;
-      int ss =sj/nrot;
-      int sss=ss+s;
-
-      for(int k=k0; k<k1; ++k){
-	auto tmp = coalescedRead(Bp[ss*nrot+j]);
-	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
-      }
-    });
-
-    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
-      int j =sj%nrot;
-      int jj  =j0+j;
-      int ss =sj/nrot;
-      int sss=ss+s;
-      coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
-    });
-  }
-#endif
-}
-
-// Extract a single rotated vector
-template<class Field>
-void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
-{
-  typedef decltype(basis[0].View()) View;
-  typedef typename Field::vector_object vobj;
-  GridBase* grid = basis[0].Grid();
-
-  result.Checkerboard() = basis[0].Checkerboard();
-  auto result_v=result.View();
-  Vector<View> basis_v(basis.size(),result_v);
-  for(int k=0;k<basis.size();k++){
-    basis_v[k] = basis[k].View();
-  }
-  vobj zz=Zero();
-  Vector<double> Qt_jv(Nm);
-  double * Qt_j = & Qt_jv[0];
-  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
-  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
-    auto B=coalescedRead(zz);
-    for(int k=k0; k<k1; ++k){
-      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
-    }
-    coalescedWrite(result_v[ss], B);
-  });
-}
-
-template<class Field>
-void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
-{
-  int vlen = idx.size();
-
-  assert(vlen>=1);
-  assert(vlen<=sort_vals.size());
-  assert(vlen<=_v.size());
-
-  for (size_t i=0;i<vlen;i++) {
-
-    if (idx[i] != i) {
-
-      //////////////////////////////////////
-      // idx[i] is a table of desired sources giving a permutation.
-      // Swap v[i] with v[idx[i]].
-      // Find  j>i for which _vnew[j] = _vold[i],
-      // track the move idx[j] => idx[i]
-      // track the move idx[i] => i
-      //////////////////////////////////////
-      size_t j;
-      for (j=i;j<idx.size();j++)
-	if (idx[j]==i)
-	  break;
-
-      assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i);
-
-      swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
-      std::swap(sort_vals[i],sort_vals[idx[i]]);
-
-      idx[j] = idx[i];
-      idx[i] = i;
-    }
-  }
-}
-
-inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
-{
-  std::vector<int> idx(sort_vals.size());
-  std::iota(idx.begin(), idx.end(), 0);
-
-  // sort indexes based on comparing values in v
-  std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
-    return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
-  });
-  return idx;
-}
-
-template<class Field>
-void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
-{
-  std::vector<int> idx = basisSortGetIndex(sort_vals);
-  if (reverse)
-    std::reverse(idx.begin(), idx.end());
-  
-  basisReorderInPlace(_v,sort_vals,idx);
-}
-
-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template<class Field>
-void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
-  result = Zero();
-  assert(_v.size()==eval.size());
-  int N = (int)_v.size();
-  for (int i=0;i<N;i++) {
-    Field& tmp = _v[i];
-    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
-  }
-}
-
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
@@ -405,6 +405,70 @@ namespace Grid {
    }
  };

+  // Red-black (Schur) solve for a non-Hermitian checkerboarded matrix: the odd-site system is passed to
+  // the wrapped solver via NonHermitianSchurDiagMooeeOperator; no Mdag / normal-equation step is applied.
+  template<class Field> class NonHermitianSchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> 
+  {
+    public:
+      typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+      NonHermitianSchurRedBlackDiagMooeeSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
+          const bool _solnAsInitGuess = false)  
+      : SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {}
+
+      //////////////////////////////////////////////////////
+      // Split src into checkerboards and build the odd-site source of the Schur system
+      //////////////////////////////////////////////////////
+      virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
+      {
+        GridBase* grid  = _Matrix.RedBlackGrid();
+
+        Field  tmp(grid);
+        Field Mtmp(grid);
+
+        pickCheckerboard(Even, src_e, src);
+        pickCheckerboard(Odd , src_o, src);
+
+        /////////////////////////////////////////////////////
+        // src_o = source_o - Moe MeeInv source_e   (non-Hermitian: no Mdag is applied)
+        /////////////////////////////////////////////////////
+        _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even );
+        _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );     
+        src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );     
+      }
+      
+      virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
+      {
+        GridBase* grid  = _Matrix.RedBlackGrid();
+
+        Field     tmp(grid);
+        Field   sol_e(grid);
+        Field src_e_i(grid);
+        
+        ///////////////////////////////////////////////////
+        // sol_e = M_ee^-1 * ( src_e - Meo sol_o ), then merge both checkerboards into sol
+        ///////////////////////////////////////////////////
+        _Matrix.Meooe(sol_o, tmp);         assert(     tmp.Checkerboard() == Even );
+        src_e_i = src_e - tmp;             assert( src_e_i.Checkerboard() == Even );
+        _Matrix.MooeeInv(src_e_i, sol_e);  assert(   sol_e.Checkerboard() == Even );
+       
+        setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
+        setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd  );
+      }
+
+      virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
+      {
+        NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix); // Schur operator handed to the solver
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o);  assert(sol_o.Checkerboard() == Odd); // member name comes from the base class
+      }
+
+      virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
+      {
+        NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix); // same operator, multi-RHS entry point
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
+      }
+  };
+
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal is identity, right preconditioned by Mee^inv
  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi =  = eta  = eta
@@ -482,5 +546,76 @@ namespace Grid {
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
    }
  };
+
+  // Red-black (Schur) solve for a non-Hermitian checkerboarded matrix using NonHermitianSchurDiagTwoOperator;
+  // the solver output is passed through MooeeInv before the even-site solution is reconstructed.
+  template<class Field> class NonHermitianSchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> 
+  {
+    public:
+      typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+      /////////////////////////////////////////////////////
+      // Forward the solver and the initial-guess options to SchurRedBlackBase
+      /////////////////////////////////////////////////////
+      NonHermitianSchurRedBlackDiagTwoSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
+          const bool _solnAsInitGuess = false)  
+      : SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {}
+
+      virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
+      {
+        GridBase* grid  = _Matrix.RedBlackGrid();
+
+        Field  tmp(grid);
+        Field Mtmp(grid);
+
+        pickCheckerboard(Even, src_e, src);
+        pickCheckerboard(Odd , src_o, src);
+      
+        /////////////////////////////////////////////////////
+        // src_o = source_o - Moe MeeInv source_e   (non-Hermitian: no Mdag is applied)
+        /////////////////////////////////////////////////////
+        _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even );
+        _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );     
+        src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );     
+      }
+
+      virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
+      {
+        GridBase* grid  = _Matrix.RedBlackGrid();
+
+        Field sol_o_i(grid);
+        Field     tmp(grid);
+        Field   sol_e(grid);
+
+        ////////////////////////////////////////////////
+        // MooeeInv due to precond: sol_o_i = MooeeInv sol_o
+        ////////////////////////////////////////////////
+        _Matrix.MooeeInv(sol_o, tmp);
+        sol_o_i = tmp;
+
+        ///////////////////////////////////////////////////
+        // sol_e = M_ee^-1 * ( src_e - Meo sol_o_i ), then merge both checkerboards into sol
+        ///////////////////////////////////////////////////
+        _Matrix.Meooe(sol_o_i, tmp);    assert(   tmp.Checkerboard() == Even );
+        tmp = src_e - tmp;              assert(   tmp.Checkerboard() == Even ); // check the value just computed, not the const input
+        _Matrix.MooeeInv(tmp, sol_e);   assert( sol_e.Checkerboard() == Even );
+       
+        setCheckerboard(sol, sol_e);    assert(   sol_e.Checkerboard() == Even );
+        setCheckerboard(sol, sol_o_i);  assert( sol_o_i.Checkerboard() == Odd  );
+      }
+
+      virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
+      {
+        NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix); // Schur operator handed to the solver
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o); // member name comes from the base class
+      }
+
+      virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o,  std::vector<Field>& sol_o)
+      {
+        NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix); // same operator, multi-RHS entry point
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o); 
+      }
+  };
 }
+
 #endif
@@ -6,21 +6,39 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;

-#ifdef GRID_NVCC
-#define SMALL_LIMIT (0)
-#else
-#define SMALL_LIMIT (4096)
+int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax; // active small-cache size; Init() may lower it
+#ifdef GRID_CUDA
+int PointerCache::Ncache      = PointerCache::NcacheMax; // must not exceed NcacheMax (size of Entries[]); a literal 32 overran it before Init() clamped
+#else 
+int PointerCache::Ncache      = 8;
 #endif
+int PointerCache::Victim;      // next large-cache slot to evict; static storage, so starts at 0
+int PointerCache::VictimSmall; // next small-cache slot to evict; static storage, so starts at 0
+PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
+PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];

-#ifdef POINTER_CACHE
-int PointerCache::victim;
+void PointerCache::Init(void) // set both cache sizes from the environment; an unset variable leaves the current value, which is still clamped
+{
+  char * str;
 
-PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
+  str= getenv("GRID_ALLOC_NCACHE_LARGE"); // optional override for the large-allocation cache size
+  if ( str ) Ncache = atoi(str);
+  if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax; // negative or too-large values become the maximum
 
-void *PointerCache::Insert(void *ptr,size_t bytes) {
-
-  if (bytes < SMALL_LIMIT ) return ptr;
+  str= getenv("GRID_ALLOC_NCACHE_SMALL"); // optional override for the small-allocation cache size
+  if ( str ) NcacheSmall = atoi(str);
+  if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax; // same clamp as above
 
+  //  printf("Aligned allocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
+}
+// Route the pointer to the small- or the large-allocation cache according to its size.
+void *PointerCache::Insert(void *ptr,size_t bytes) {
+  const bool useLarge = !(bytes < GRID_ALLOC_SMALL_LIMIT);
+  if ( useLarge ) return Insert(ptr,bytes,Entries,Ncache,Victim);
+  return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
+}
+void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) {
+  if ( ncache <= 0 ) return ptr; // cache disabled: retain nothing, hand ptr back, and never reach '% ncache' with zero
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
@@ -28,8 +46,8 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
  void * ret = NULL;
  int v = -1;
 
-  for(int e=0;e<Ncache;e++) {
-    if ( Entries[e].valid==0 ) {
+  for(int e=0;e<ncache;e++) { // first choice: any slot that is currently free
+    if ( entries[e].valid==0 ) {
      v=e; 
      break;
    }
@@ -37,40 +55,43 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
 
  if ( v==-1 ) {
    v=victim;
-    victim = (victim+1)%Ncache;
+    victim = (victim+1)%ncache; // no free slot: reuse slots in rotation
  }
 
-  if ( Entries[v].valid ) {
-    ret = Entries[v].address;
-    Entries[v].valid = 0;
-    Entries[v].address = NULL;
-    Entries[v].bytes = 0;
+  if ( entries[v].valid ) { // slot occupied: its old address is returned to the caller
+    ret = entries[v].address;
+    entries[v].valid = 0;
+    entries[v].address = NULL;
+    entries[v].bytes = 0;
  }
 
-  Entries[v].address=ptr;
-  Entries[v].bytes  =bytes;
-  Entries[v].valid  =1;
+  entries[v].address=ptr;
+  entries[v].bytes  =bytes;
+  entries[v].valid  =1;
 
  return ret;
 }

-void *PointerCache::Lookup(size_t bytes) {
-
-  if (bytes < SMALL_LIMIT ) return NULL;
-
+// Search the small- or the large-allocation cache, chosen by request size, for a cached block.
+void *PointerCache::Lookup(size_t bytes) {
+  const bool useLarge = !(bytes < GRID_ALLOC_SMALL_LIMIT);
+  if ( useLarge ) return Lookup(bytes,Entries,Ncache);
+  return Lookup(bytes,EntriesSmall,NcacheSmall);
+}
+void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) // returns a cached address of exactly `bytes`, or NULL
+{
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
-
-  for(int e=0;e<Ncache;e++){
-    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
-      Entries[e].valid = 0;
-      return Entries[e].address;
+  for(int e=0;e<ncache;e++){ // scan only the first ncache slots of the given table
+    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { // exact size match required
+      entries[e].valid = 0; // slot is released: the block now belongs to the caller
+      return entries[e].address;
    }
  }
  return NULL;
 }
-#endif
+

 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
@@ -42,21 +42,21 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 #define POINTER_CACHE
 #define GRID_ALLOC_ALIGN (2*1024*1024)
+#define GRID_ALLOC_SMALL_LIMIT (4096)

 NAMESPACE_BEGIN(Grid);

 // Move control to configure.ac and Config.h?
-#ifdef POINTER_CACHE
+
 class PointerCache {
 private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
-#ifdef GRID_NVCC 
-  static const int Ncache=128;
-#else
-  static const int Ncache=8;
-#endif
-  static int victim;
+/* Cache sizes are run-time configurable through Init(), bounded by the *Max constants below */
+  static const int NcacheSmallMax=128; 
+  static const int NcacheMax=16;
+  static int NcacheSmall;
+  static int Ncache;

  typedef struct { 
    void *address;
@@ -64,15 +64,18 @@ private:
    int valid;
  } PointerCacheEntry;
    
-  static PointerCacheEntry Entries[Ncache];
+  static PointerCacheEntry Entries[NcacheMax];
+  static int Victim;
+  static PointerCacheEntry EntriesSmall[NcacheSmallMax];
+  static int VictimSmall;

 public:
-
+  static void Init(void);
  static void *Insert(void *ptr,size_t bytes) ;
+  static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes) ;
-
+  static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
 };
-#endif  

 std::string sizeString(size_t bytes);

@@ -89,6 +92,13 @@ public:
  static bool        debug;
 };

+#ifdef GRID_NVCC // NOTE(review): the allocator source in this change tests GRID_CUDA - confirm GRID_NVCC is still defined
+#define profilerCudaMeminfo \
+  { size_t f=0, t=0; cudaError_t e=cudaMemGetInfo(&f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << (e==cudaSuccess ? "" : " (cudaMemGetInfo failed)") << std::endl;}
+#else // non-CUDA build: the macro expands to nothing
+#define profilerCudaMeminfo
+#endif
+
 #define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
 #define profilerDebugPrint						\
  if (MemoryProfiler::stats)						\
@@ -103,7 +113,8 @@ public:
 		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
 		<< std::endl;						\
-    }
+    }									\
+  profilerCudaMeminfo;

 #define profilerAllocate(bytes)						\
  if (MemoryProfiler::stats)						\
@@ -114,6 +114,7 @@ public:
  void GlobalSumVector(RealD *,int N);
  void GlobalSum(uint32_t &);
  void GlobalSum(uint64_t &);
+  void GlobalSumVector(uint64_t*,int N);
  void GlobalSum(ComplexF &c);
  void GlobalSumVector(ComplexF *c,int N);
  void GlobalSum(ComplexD &c);
@@ -255,6 +255,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
+void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){ // in-place MPI_SUM of u[0..N) over `communicator`
+  const int status = MPI_Allreduce(MPI_IN_PLACE, u, N, MPI_UINT64_T, MPI_SUM, communicator);
+  assert(status == 0);
+}
 void CartesianCommunicator::GlobalXOR(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(ierr==0);
@@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
+void CartesianCommunicator::GlobalSumVector(double *,int N){} // empty body: leaves the array untouched (moved next to the scalar double overload)
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSum(uint64_t &){}
-void CartesianCommunicator::GlobalSumVector(double *,int N){}
+void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){} // empty body like its siblings; presumably the no-comms build - confirm
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}

@@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
  if (heap_bytes >= heap_size) {
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
-    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current heap  is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
    assert(heap_bytes<heap_size);
  }
  //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
@@ -49,4 +49,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
+
+NAMESPACE_BEGIN(Grid);
+
+template<typename Op, typename T1> // Cshift of a unary lattice expression: pass closure(expr) on to Cshift
+auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
+    -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> {
+  const auto &evaluated = closure(expr);
+  return Cshift(evaluated,dim,shift);
+}
+template <class Op, class T1, class T2> // Cshift of a binary lattice expression: pass closure(expr) on to Cshift
+auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> {
+  const auto &evaluated = closure(expr);
+  return Cshift(evaluated,dim,shift);
+}
+// Cshift of a three-operand lattice expression: pass closure(expr) on to Cshift.
+template <class Op, class T1, class T2, class T3>
+auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1), eval(0, expr.arg2), eval(0, expr.arg3)))> 
+{
+  const auto &evaluated = closure(expr);
+  return Cshift(evaluated,dim,shift);
+}
+NAMESPACE_END(Grid);
+
 #endif
@@ -35,7 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
-#include <Grid/lattice/Lattice_reality.h>
+//#include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
 #include <Grid/lattice/Lattice_coordinate.h>
@@ -43,4 +43,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_rng.h>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
-
+#include <Grid/lattice/Lattice_basis.h>
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
+Author: Christoph Lehner <christoph@lhnr.de>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -94,7 +95,7 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
 template <class lobj> accelerator_inline 
 const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
 {
-  auto view = arg.View();
+  auto view = arg.AcceleratorView(ViewRead); // NOTE(review): a reference taken from this local view is returned - confirm it stays valid after `view` is destroyed
  return view[ss];
 }

@@ -7,6 +7,7 @@
    Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
  conformable(ret,rhs);
  conformable(lhs,rhs);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -55,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -72,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -88,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -107,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    mult(&tmp,&lhs_v(ss),&rhs);
@@ -120,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -134,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -147,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
-  auto ret_v = ret.View();
-  auto lhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -164,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -178,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -192,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -205,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.View();
-  auto rhs_v = lhs.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -220,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
-  auto ret_v = ret.View();
-  auto x_v = x.View();
-  auto y_v = y.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto x_v = x.AcceleratorView(ViewRead);
+  auto y_v = y.AcceleratorView(ViewRead);
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
    auto tmp = a*x_v(ss)+y_v(ss);
    coalescedWrite(ret_v[ss],tmp);
@@ -233,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
-  auto ret_v = ret.View();
-  auto x_v = x.View();
-  auto y_v = y.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto x_v = x.AcceleratorView(ViewRead);
+  auto y_v = y.AcceleratorView(ViewRead);
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
    auto tmp = a*x_v(ss)+b*y_v(ss);
    coalescedWrite(ret_v[ss],tmp);
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -49,6 +50,26 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
  assert(lhs == rhs);
 }

+////////////////////////////////////////////////////////////////////////////
+// Advise the LatticeAccelerator class
+////////////////////////////////////////////////////////////////////////////
+enum LatticeAcceleratorAdvise {
+  // The values are distinct bits: Advise(int) tests each one with a bitwise AND,
+  // so several hints can be OR-ed together into a single argument.
+  AdviseInfrequentUse = 0x1,    // Advise that the data is used infrequently.  This can
+                                // significantly influence performance of bulk storage.
+  AdviseReadMostly = 0x2,       // Data will mostly be read.  On some architectures
+                                // enables read-only copies of memory to be kept on
+                                // host and device.
+};
+
+////////////////////////////////////////////////////////////////////////////
+// View Access Mode
+////////////////////////////////////////////////////////////////////////////
+enum ViewMode {
+  // Declared intent of a lattice view; passed as the int mode argument of
+  // AcceleratorView / HostView and forwarded to the prefetch routines.
+  ViewRead = 0x1,      // data is only read through the view
+  ViewWrite = 0x2,     // data is written through the view
+  ViewReadWrite = 0x3  // numerically ViewRead | ViewWrite
+};
+
 ////////////////////////////////////////////////////////////////////////////
 // Minimal base class containing only data valid to access from accelerator
 // _odata will be a managed pointer in CUDA
@@ -75,6 +96,37 @@ public:
    if (grid) conformable(grid, _grid);
    else      grid = _grid;
  };
+
+  // Pass usage hints for this lattice's storage to the CUDA runtime.
+  //   advise : bitwise OR of LatticeAcceleratorAdvise flags.
+  // Does nothing unless built with GRID_NVCC, and nothing in device code.
+  // The return codes of cudaMemAdvise are not inspected (hints are best effort).
+  accelerator_inline void Advise(int advise) {
+#if defined(GRID_NVCC) && !defined(__CUDA_ARCH__) // host side of a CUDA build only
+    const size_t bytes = _odata_size*sizeof(vobj);
+    const bool infrequent = (advise & AdviseInfrequentUse) != 0;
+    const bool readMostly = (advise & AdviseReadMostly) != 0;
+
+    // Rarely used data: prefer to keep it resident in host memory
+    if (infrequent) cudaMemAdvise(_odata,bytes,cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
+
+    // Mostly read data: the device argument is not used for this advice, hence -1
+    if (readMostly) cudaMemAdvise(_odata,bytes,cudaMemAdviseSetReadMostly,-1);
+#endif
+  };
+
+  // Ask the CUDA runtime to start moving this lattice's storage to the device that
+  // is current for the calling host thread (cudaMemPrefetchAsync on the default stream).
+  //   accessMode : a ViewMode value; accepted for interface stability but not used yet.
+  // Does nothing unless built with GRID_NVCC, and nothing in device code.
+  accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+    int target = 0;
+    // Only prefetch when the current device could be queried.  Previously a failing
+    // cudaGetDevice left 'target' uninitialised and it was still used as the destination.
+    if (cudaGetDevice(&target) == cudaSuccess) {
+      cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
+    }
+#endif
+#endif
+  };
+
+  // Ask the CUDA runtime to start moving this lattice's storage back to host memory
+  // (cudaMemPrefetchAsync with cudaCpuDeviceId as destination, default stream).
+  //   accessMode : a ViewMode value; accepted but not used yet.
+  // Does nothing unless built with GRID_NVCC, and nothing in device code.
+  accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
+#if defined(GRID_NVCC) && !defined(__CUDA_ARCH__) // host side of a CUDA build only
+    const size_t bytes = _odata_size*sizeof(vobj);
+    cudaMemPrefetchAsync(_odata,bytes,cudaCpuDeviceId);
+#endif
+  };
 };

 /////////////////////////////////////////////////////////////////////////////////////////
@@ -206,9 +258,23 @@ public:
  // The view is trivially copy constructible and may be copied to an accelerator device
  // in device lambdas
  /////////////////////////////////////////////////////////////////////////////////
-  LatticeView<vobj> View (void) const 
+  LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
+  {                                   //                     and HostView        for thread_for
+    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+    return accessor;
+  }
+
+  LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const 
  {
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+    accessor.AcceleratorPrefetch(mode);
+    return accessor;
+  }
+
+  LatticeView<vobj> HostView(int mode = ViewReadWrite) const 
+  {
+    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+    accessor.HostPrefetch(mode);
    return accessor;
  }
  
@@ -232,7 +298,7 @@ public:
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;

-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,expr);
      vstream(me[ss],tmp);
@@ -251,7 +317,7 @@ public:
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;

-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,expr);
      vstream(me[ss],tmp);
@@ -269,7 +335,7 @@ public:
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,expr);
      vstream(me[ss],tmp);
@@ -357,7 +423,6 @@ public:
  // copy constructor
  ///////////////////////////////////////////
  Lattice(const Lattice& r){ 
-    //    std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl; 
    this->_grid = r.Grid();
    resize(this->_grid->oSites());
    *this = r;
@@ -380,8 +445,8 @@ public:
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
    conformable(*this,r);
    this->checkerboard = r.Checkerboard();
-    auto me =   View();
-    auto him= r.View();
+    auto me =   AcceleratorView(ViewWrite);
+    auto him= r.AcceleratorView(ViewRead);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@@ -394,8 +459,8 @@ public:
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    this->checkerboard = r.Checkerboard();
    conformable(*this,r);
-    auto me =   View();
-    auto him= r.View();
+    auto me =   AcceleratorView(ViewWrite);
+    auto him= r.AcceleratorView(ViewRead);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@@ -0,0 +1,236 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/lattice/Lattice_basis.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Remove from w, one after another, its projection onto each of basis[0..k-1]:
+//   w <- w - <basis[j]|w> basis[j]      for j = 0,...,k-1
+template<class Field>
+void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
+{
+  // Possible optimisation (not done here): if the basis[j] are assumed to be
+  // orthonormal already, all inner products could be taken in one parallel pass,
+  // saving 2x bandwidth, and the update line could save 3x bandwidth;
+  // perhaps 2.5x speed up, 2x overall in Multigrid Lanczos.
+  int j = 0;
+  while (j < k) {
+    auto overlap = innerProduct(basis[j],w);
+    w = w - overlap*basis[j];
+    ++j;
+  }
+}
+
+// In-place rotation of a set of lattice vectors by a matrix:
+//   for every site and every j in [j0,j1):  basis[j] <- sum_{k in [k0,k1)} Qt(j,k) * basis[k]
+// All new values for a site are accumulated before any basis[j] is overwritten.
+//   basis : indexable container of lattice fields; basis[0] is accessed unconditionally,
+//           so it must not be empty.
+//   Qt    : matrix, read through Qt(j,k).
+//   Nm    : size of the per-site accumulator array (CPU path) and row stride of the
+//           flattened matrix copy (GPU path); indices j<j1 and k<k1 are used against it,
+//           so presumably j1<=Nm and k1<=Nm are required -- NOTE(review): not checked here.
+template<class VField, class Matrix>
+void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) 
+{
+  typedef decltype(basis[0]) Field; // NOTE(review): not referenced in this function
+  typedef decltype(basis[0].View()) View;
+  // View table, initially filled with copies of the first view, then overwritten below
+  auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
+  Vector<View> basis_v(basis.size(),tmp_v);
+  typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
+  GridBase* grid = basis[0].Grid();
+      
+  for(int k=0;k<basis.size();k++){
+    basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
+  }
+
+#ifndef GRID_NVCC
+  // Host path: each thread owns an accumulator array B and handles whole sites
+  thread_region
+  {
+    std::vector < vobj > B(Nm); // Thread private
+    thread_for_in_region(ss, grid->oSites(),{
+	for(int j=j0; j<j1; ++j) B[j]=0.;
+      
+	for(int j=j0; j<j1; ++j){
+	  for(int k=k0; k<k1; ++k){
+	    B[j] +=Qt(j,k) * basis_v[k][ss];
+	  }
+	}
+	for(int j=j0; j<j1; ++j){
+	  basis_v[j][ss] = B[j];
+	}
+      });
+  }
+#else
+  // GRID_NVCC path: sites are processed in blocks of siteBlock sites; Bt holds
+  // nrot accumulators per site of the current block.
+  // Note that the empty-rotation early return exists only in this branch.
+  int nrot = j1-j0;
+  if (!nrot) // edge case not handled gracefully by Cuda
+    return;
+
+  uint64_t oSites   =grid->oSites();
+  // ceil(oSites/nrot): Bt then holds siteBlock*nrot >= oSites elements
+  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
+
+  Vector <vobj> Bt(siteBlock * nrot); 
+  auto Bp=&Bt[0];
+
+  // GPU readable copy of matrix
+  // (row-major, stride Nm, entries stored as double)
+  Vector<double> Qt_jv(Nm*Nm);
+  double *Qt_p = & Qt_jv[0];
+  thread_for(i,Nm*Nm,{
+      int j = i/Nm;
+      int k = i%Nm;
+      Qt_p[i]=Qt(j,k);
+    });
+
+  // Block the loop to keep storage footprint down
+  for(uint64_t s=0;s<oSites;s+=siteBlock){
+
+    // remaining work in this block
+    int ssites=MIN(siteBlock,oSites-s);
+
+    // zero out the accumulators
+    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
+	decltype(coalescedRead(Bp[ss])) z;
+	z=Zero();
+	coalescedWrite(Bp[ss],z);
+      });
+
+    // accumulate: one work item per (site in block, output vector j) pair,
+    // flattened as sj = ss*nrot + j; sss is the global site index
+    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
+	
+	int j =sj%nrot;
+	int jj  =j0+j;
+	int ss =sj/nrot;
+	int sss=ss+s;
+
+	for(int k=k0; k<k1; ++k){
+	  auto tmp = coalescedRead(Bp[ss*nrot+j]);
+	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
+	}
+      });
+
+    // write the accumulated values back over basis[j0..j1) for this block of sites
+    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
+	int j =sj%nrot;
+	int jj  =j0+j;
+	int ss =sj/nrot;
+	int sss=ss+s;
+	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
+      });
+  }
+#endif
+}
+
+// Compute a single rotated vector without modifying the basis:
+//   result = sum_{k in [k0,k1)} Qt(j,k) * basis[k]
+// result takes the checkerboard of basis[0]; row j of Qt (first Nm entries) is
+// copied into a Vector<double> before the site loop.
+template<class Field>
+void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
+{
+  typedef typename Field::vector_object vobj;
+  typedef decltype(basis[0].AcceleratorView()) View;
+
+  GridBase* grid = basis[0].Grid();
+  result.Checkerboard() = basis[0].Checkerboard();
+
+  // Output view; it also serves as the initial fill value of the input view table
+  auto out_v=result.AcceleratorView(ViewWrite);
+  Vector<View> in_v(basis.size(),out_v);
+  for(int b=0;b<basis.size();++b){
+    in_v[b] = basis[b].AcceleratorView(ViewRead);
+  }
+
+  vobj zero_v=Zero();
+
+  // Row j of the rotation matrix
+  Vector<double> row(Nm);
+  double * row_p = & row[0];
+  for(int c=0;c<Nm;++c) row_p[c]=Qt(j,c);
+
+  accelerator_for(site, grid->oSites(),vobj::Nsimd(),{
+    auto acc=coalescedRead(zero_v);
+    for(int c=k0; c<k1; ++c){
+      acc +=row_p[c] * coalescedRead(in_v[c][site]);
+    }
+    coalescedWrite(out_v[site], acc);
+  });
+}
+
+// Permute _v and sort_vals in place so that, for i < idx.size(), position i ends up
+// holding the entry that was at position idx[i] on entry.  idx is consumed in the
+// process: each handled entry is set to idx[i]==i.
+template<class Field>
+void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
+{
+  int vlen = idx.size();
+
+  assert(vlen>=1);
+  assert(vlen<=sort_vals.size());
+  assert(vlen<=_v.size());
+
+  for (size_t i=0;i<vlen;i++) {
+
+    // already in place
+    if (idx[i] == i) continue;
+
+    //////////////////////////////////////
+    // idx is a table of desired sources, i.e. a permutation.
+    // Position i is filled by swapping with its source idx[i]; the old
+    // content of i then lives at idx[i], so the later slot j that wanted
+    // source i must be redirected to idx[i].
+    //////////////////////////////////////
+    size_t j = i;
+    while (j<idx.size() && !(idx[j]==i)) {
+      j++;
+    }
+
+    assert(idx[i] > i);
+    assert(j!=idx.size());
+    assert(idx[j]==i);
+
+    const int src = idx[i];
+    swap(_v[i],_v[src]); // should use vector move constructor, no data copy
+    std::swap(sort_vals[i],sort_vals[src]);
+
+    idx[j] = src;
+    idx[i] = i;
+  }
+}
+
+// Return the indices 0..sort_vals.size()-1 ordered by increasing |sort_vals[i]|.
+// sort_vals itself is not modified.
+inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
+{
+  // start from the identity permutation
+  std::vector<int> order(sort_vals.size());
+  std::iota(order.begin(), order.end(), 0);
+
+  // compare two positions by the magnitude of the value they refer to
+  auto smallerMagnitude = [&sort_vals](int a, int b) {
+    return ::fabs(sort_vals[a]) < ::fabs(sort_vals[b]);
+  };
+  std::sort(order.begin(), order.end(), smallerMagnitude);
+
+  return order;
+}
+
+// Sort _v and sort_vals together by |sort_vals| (ascending, or descending when
+// reverse is true) using basisSortGetIndex followed by basisReorderInPlace.
+template<class Field>
+void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
+{
+  std::vector<int> perm = basisSortGetIndex(sort_vals);
+
+  if (reverse) {
+    std::reverse(perm.begin(), perm.end());
+  }
+
+  basisReorderInPlace(_v,sort_vals,perm);
+}
+
+// Deflation guess built from a set of vectors and associated values:
+//   result = sum_i  ( <_v[i]|src_orig> / eval[i] ) * _v[i]
+// _v and eval must have the same length (asserted).
+//
+// PAB: faster to compute the inner products first then fuse loops.
+// If performance critical can improve.
+template<class Field>
+void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
+  result = Zero();
+  assert(_v.size()==eval.size());
+  int N = (int)_v.size();
+  for (int i=0;i<N;i++) {
+    // _v is a const vector, so its elements can only be bound to a const reference;
+    // the previous non-const 'Field&' made this template ill-formed once instantiated.
+    const Field& tmp = _v[i];
+    // result <- coeff * tmp + result
+    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
+  }
+}
+
+NAMESPACE_END(Grid);
@@ -156,7 +156,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj,class sobj>
-accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
+inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
        
  GridBase *grid = l.Grid();

@@ -185,7 +185,7 @@ accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate
 };

 template<class vobj,class sobj>
-accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
+inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){

  GridBase *grid=l.Grid();

@@ -40,6 +40,7 @@ NAMESPACE_BEGIN(Grid);

 template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
+  ret.Checkerboard()=lhs.Checkerboard();
  auto lhs_v = lhs.View();
  auto ret_v = ret.View();
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
@@ -50,6 +51,7 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){

 template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
+  ret.Checkerboard() = lhs.Checkerboard();
  auto lhs_v = lhs.View();
  auto ret_v = ret.View();
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
@@ -5,6 +5,7 @@
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
@@ -93,7 +94,7 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){

 // Double inner product
 template<class vobj>
-inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
@@ -102,8 +103,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
  GridBase *grid = left.Grid();
  
  // Might make all code paths go this way.
-  auto left_v = left.View();
-  auto right_v=right.View();
+  auto left_v = left.AcceleratorView(ViewRead);
+  auto right_v=right.AcceleratorView(ViewRead);

  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
@@ -137,11 +138,18 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
  })
  nrm = TensorRemove(sum(inner_tmp_v,sites));
 #endif
-  grid->GlobalSum(nrm);
-
  return nrm;
 }

+// Inner product of two lattices: the local contribution from rankInnerProduct is
+// passed through the grid's GlobalSum (presumably the reduction across ranks).
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
+  ComplexD total = rankInnerProduct(left,right);
+  left.Grid()->GlobalSum(total);
+  return total;
+}
+
+
 /////////////////////////
 // Fast axpby_norm
 // z = a x + b y
@@ -167,9 +175,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  
  GridBase *grid = x.Grid();

-  auto x_v=x.View();
-  auto y_v=y.View();
-  auto z_v=z.View();
+  auto x_v=x.AcceleratorView(ViewRead);
+  auto y_v=y.AcceleratorView(ViewRead);
+  auto z_v=z.AcceleratorView(ViewWrite);

  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
@@ -204,8 +212,64 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  grid->GlobalSum(nrm);
  return nrm; 
 }
-
 
+// Fused inner product and norm: in one sweep over the sites computes
+//   ip  = <left|right>
+//   nrm = real part of <left|left>
+// Both site sums are combined with a single GlobalSumVector call of length 2.
+// left and right must be conformable (checked on entry).
+template<class vobj> strong_inline void
+innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
+{
+  conformable(left,right);
+
+  // NOTE(review): scalar_type and vector_type are not referenced below
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
+  // tmp[0] carries the inner product, tmp[1] the norm (kept complex, see below)
+  Vector<ComplexD> tmp(2);
+
+  GridBase *grid = left.Grid();
+
+  auto left_v=left.AcceleratorView(ViewRead);
+  auto right_v=right.AcceleratorView(ViewRead);
+
+  const uint64_t nsimd = grid->Nsimd();
+  const uint64_t sites = grid->oSites();
+
+#ifdef GRID_NVCC
+  // GPU
+  // per-site results go to temporaries, which are then reduced with sumD_gpu
+  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
+  typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
+  Vector<inner_t> inner_tmp(sites);
+  Vector<norm_t> norm_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+  auto norm_tmp_v = &norm_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+      auto left_tmp = left_v(ss);
+      coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
+      coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
+  });
+
+  tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
+  tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
+#else
+  // CPU
+  // per-site results are formed with innerProductD and reduced with sum
+  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
+  typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
+  Vector<inner_t> inner_tmp(sites);
+  Vector<norm_t> norm_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+  auto norm_tmp_v = &norm_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+      auto left_tmp = left_v(ss);
+      inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
+      norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
+  });
+  // Already promoted to double
+  tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
+  tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
+#endif
+  grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
+  ip = tmp[0];
+  nrm = real(tmp[1]);
+}
+
 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
@@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Trace
 ////////////////////////////////////////////////////////////////////////////////////////////////////
+/*
 template<class vobj>
 inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
 {
@@ -48,6 +49,7 @@ inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
  });
  return ret;
 };
+*/
    
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Trace Index level dependent operation
@@ -6,6 +6,7 @@
    Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -63,6 +64,7 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
    }
  });
 }
+
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
  int cb = half.Checkerboard();
  auto half_v = half.View();
@@ -81,25 +83,130 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
    }
  });
 }
-  
-template<class vobj,class CComplex,int nbasis>
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Flexible Type Conversion for internal promotion to double as well as graceful
+// treatment of scalar-compatible types
+////////////////////////////////////////////////////////////////////////////////////////////
+// convertType(out,in): overloads for plain complex numbers and SIMD complex types.
+// Each overload writes a converted copy of 'in' into 'out'.
+
+// double-precision complex: plain assignment
+accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
+  out = in;
+}
+
+// single-precision complex: plain assignment
+accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
+  out = in;
+}
+
+#ifdef __CUDA_ARCH__
+// Device code only: store a scalar into one slot of a SIMD vector.  The vector is
+// viewed as an array of scalars and the slot index is SIMTlane(<number of slots>).
+accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
+  ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
+}
+accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
+  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
+}
+// vComplexD2 is indexed with twice the vComplexD slot count
+accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
+  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
+}
+#endif
+
+// The two _internal members of a vComplexD2 are combined into one vComplexF (DtoS)
+accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
+  out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
+}
+
+// One vComplexF is expanded into the two _internal members of a vComplexD2 (StoD)
+accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
+  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
+}
+
+// convertType(out,in): overloads that recurse through the tensor wrappers
+// (iScalar / iVector / iMatrix) down to the element-level overloads.
+
+// Forward declarations so that the iScalar overloads below can call the
+// iMatrix / iVector versions, which are defined after them.
+template<typename T1,typename T2,int N>
+  accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
+template<typename T1,typename T2,int N>
+  accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
+
+// Unwrap an iScalar on the input side; only enabled when isGridScalar<T1>::value is false
+template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
+accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
+  convertType(out,in._internal);
+}
+
+// Unwrap an iScalar on the output side
+template<typename T1,typename T2>
+accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
+  convertType(out._internal,in);
+}
+
+// Element-by-element conversion of an N x N matrix
+template<typename T1,typename T2,int N>
+accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
+  for (int i=0;i<N;i++)
+    for (int j=0;j<N;j++)
+      convertType(out._internal[i][j],in._internal[i][j]);
+}
+
+// Element-by-element conversion of a length-N vector
+template<typename T1,typename T2,int N>
+accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
+  for (int i=0;i<N;i++)
+    convertType(out._internal[i],in._internal[i]);
+}
+
+// Same type on both sides, enabled when isGridFundamental<T>::value is true: plain copy
+template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
+accelerator_inline void convertType(T & out, const T & in) {
+  out = in;
+}
+
+// Lattice-wide conversion: applies the site-level convertType to every site of
+// 'in', writing into the corresponding site of 'out'.  The loop runs over the
+// number of sites of 'out'.
+template<typename T1,typename T2>
+accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
+  auto dst = out.AcceleratorView(ViewWrite);
+  auto src = in.AcceleratorView(ViewRead);
+
+  accelerator_for(site,dst.size(),T1::Nsimd(),{
+      convertType(dst[site],src(site));
+    });
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// precision-promoted local inner product
+////////////////////////////////////////////////////////////////////////////////////////////
+// Site-local inner product of two lattices: for each site, innerProductD2 of the
+// two site objects is converted (convertType) into the corresponding site of the
+// returned lattice, which lives on lhs's grid and carries lhs's checkerboard.
+template<class vobj>
+inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
+-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
+{
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
+
+  typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
+  Lattice<iScalar<t_inner>> ret(lhs.Grid());
+  // Propagate the checkerboard of the inputs.  Lattice assignment copies the
+  // checkerboard of its right-hand side, so without this a caller that assigns the
+  // result to a lattice with an explicitly set checkerboard would have it overwritten.
+  ret.Checkerboard() = lhs.Checkerboard();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+
+  accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
+      convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
+    });
+
+  return ret;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// block routines
+////////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
-			  const             Lattice<vobj>   &fineData,
-			  const std::vector<Lattice<vobj> > &Basis)
+			   const             Lattice<vobj>   &fineData,
+			   const VLattice &Basis)
 {
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();

-  Lattice<CComplex> ip(coarse); 
+  Lattice<iScalar<CComplex>> ip(coarse);
+  Lattice<vobj>     fineDataRed = fineData;

  //  auto fineData_   = fineData.View();
-  auto coarseData_ = coarseData.View();
-  auto ip_         = ip.View();
+  auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
+  auto ip_         = ip.AcceleratorView(ViewReadWrite);
  for(int v=0;v<nbasis;v++) {
-    blockInnerProduct(ip,Basis[v],fineData);
+    blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
-	coalescedWrite(coarseData_[sc](v),ip_(sc));
+	convertType(coarseData_[sc](v),ip_[sc]);
      });
+
+    // improve numerical stability of projection
+    // |fine> = |fine> - <basis|fine> |basis>
+    ip=-ip;
+    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
  }
 }

@@ -166,11 +273,11 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
  return;
 }

-template<class vobj,class CComplex>
-inline void blockZAXPY(Lattice<vobj> &fineZ,
-		       const Lattice<CComplex> &coarseA,
-		       const Lattice<vobj> &fineX,
-		       const Lattice<vobj> &fineY)
+template<class vobj,class vobj2,class CComplex>
+  inline void blockZAXPY(Lattice<vobj> &fineZ,
+			 const Lattice<CComplex> &coarseA,
+			 const Lattice<vobj2> &fineX,
+			 const Lattice<vobj> &fineY)
 {
  GridBase * fine  = fineZ.Grid();
  GridBase * coarse= coarseA.Grid();
@@ -182,7 +289,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
  conformable(fineX,fineZ);

  int _ndimension = coarse->_ndimension;
-  
+
  Coordinate  block_r      (_ndimension);

  // FIXME merge with subdivide checking routine as this is redundant
@@ -191,29 +298,65 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
  }

-  auto fineZ_  = fineZ.View();
-  auto fineX_  = fineX.View();
-  auto fineY_  = fineY.View();
-  auto coarseA_= coarseA.View();
+  auto fineZ_  = fineZ.AcceleratorView(ViewWrite);
+  auto fineX_  = fineX.AcceleratorView(ViewRead);
+  auto fineY_  = fineY.AcceleratorView(ViewRead);
+  auto coarseA_= coarseA.AcceleratorView(ViewRead);

  accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
-    
-    int sc;
-    Coordinate coor_c(_ndimension);
-    Coordinate coor_f(_ndimension);

-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+      int sc;
+      Coordinate coor_c(_ndimension);
+      Coordinate coor_f(_ndimension);

-    // z = A x + y
-    coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));
+      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

-  });
+      // z = A x + y
+#ifdef __CUDA_ARCH__
+      typename vobj2::tensor_reduced::scalar_object cA;
+      typename vobj::scalar_object cAx;
+#else
+      typename vobj2::tensor_reduced cA;
+      vobj cAx;
+#endif
+      convertType(cA,TensorRemove(coarseA_(sc)));
+      auto prod = cA*fineX_(sf);
+      convertType(cAx,prod);
+      coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
+
+    });

  return;
 }
+
 template<class vobj,class CComplex>
+  inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
+				 const Lattice<vobj> &fineX,
+				 const Lattice<vobj> &fineY)
+{
+  typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
+
+  GridBase *coarse(CoarseInner.Grid());
+  GridBase *fine  (fineX.Grid());
+
+  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
+  Lattice<dotp> coarse_inner(coarse);
+
+  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
+  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
+
+  // Precision promotion
+  fine_inner = localInnerProductD(fineX,fineY);
+  blockSum(coarse_inner,fine_inner);
+  accelerator_for(ss, coarse->oSites(), 1, {
+      convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
+    });
+ 
+}
+
+template<class vobj,class CComplex> // deprecate
 inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
 			      const Lattice<vobj> &fineX,
 			      const Lattice<vobj> &fineY)
@@ -227,8 +370,8 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
  Lattice<dotp> coarse_inner(coarse);

  // Precision promotion?
-  auto CoarseInner_  = CoarseInner.View();
-  auto coarse_inner_ = coarse_inner.View();
+  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
+  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);

  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
@@ -236,6 +379,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
    CoarseInner_[ss] = coarse_inner_[ss];
  });
 }
+
 template<class vobj,class CComplex>
 inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 {
@@ -248,7 +392,7 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 // useful in multigrid project;
 // Generic name : Coarsen?
 template<class vobj>
-inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
+inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) 
 {
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
@@ -256,42 +400,41 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  subdivides(coarse,fine); // require they map

  int _ndimension = coarse->_ndimension;
-  
+
  Coordinate  block_r      (_ndimension);
-  
+
  for(int d=0 ; d<_ndimension;d++){
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
  }
  int blockVol = fine->oSites()/coarse->oSites();

-  // Turn this around to loop threaded over sc and interior loop 
-  // over sf would thread better
-  auto coarseData_ = coarseData.View();
-  auto fineData_   = fineData.View();
+  auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
+  auto fineData_   = fineData.AcceleratorView(ViewRead);

  accelerator_for(sc,coarse->oSites(),1,{

-    // One thread per sub block
-    Coordinate coor_c(_ndimension);
-    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
-    coarseData_[sc]=Zero();
+      // One thread per sub block
+      Coordinate coor_c(_ndimension);
+      Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
+      coarseData_[sc]=Zero();

-    for(int sb=0;sb<blockVol;sb++){
-      
-      int sf;
-      Coordinate coor_b(_ndimension);
-      Coordinate coor_f(_ndimension);
-      Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
-      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
-      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
+      for(int sb=0;sb<blockVol;sb++){

-      coarseData_[sc]=coarseData_[sc]+fineData_[sf];
-    }
+	int sf;
+	Coordinate coor_b(_ndimension);
+	Coordinate coor_f(_ndimension);
+	Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
+	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
+	Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);

-  });
+	coarseData_[sc]=coarseData_[sc]+fineData_[sf];
+      }
+
+    });
  return;
 }

+
 template<class vobj>
 inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
 {
@@ -313,8 +456,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
  }
 }

-template<class vobj,class CComplex>
-inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
+template<class CComplex,class VLattice>
+inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
 {
  GridBase *coarse = ip.Grid();
  GridBase *fine   = Basis[0].Grid();
@@ -322,23 +465,30 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
  int       nbasis = Basis.size() ;

  // checks
-  subdivides(coarse,fine); 
+  subdivides(coarse,fine);
  for(int i=0;i<nbasis;i++){
    conformable(Basis[i].Grid(),fine);
  }

  for(int v=0;v<nbasis;v++) {
    for(int u=0;u<v;u++) {
-      //Inner product & remove component 
-      blockInnerProduct(ip,Basis[u],Basis[v]);
+      //Inner product & remove component
+      blockInnerProductD(ip,Basis[u],Basis[v]);
      ip = -ip;
-      blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]);
+      blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
    }
    blockNormalise(ip,Basis[v]);
  }
 }

+// Backward-compatible entry point kept under the old name: it forwards its
+// arguments unchanged to blockOrthonormalize(ip,Basis).
+template<class vobj,class CComplex>
+inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated inaccurate naming
+{
+  blockOrthonormalize(ip,Basis);
+}
+
 #if 0
+// TODO: CPU optimized version here
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -383,24 +533,18 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
  
 }
 #else
-template<class vobj,class CComplex,int nbasis>
+template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
-			 const std::vector<Lattice<vobj> > &Basis)
+			 const VLattice &Basis)
 {
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
-
  fineData=Zero();
  for(int i=0;i<nbasis;i++) {
    Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
-    Lattice<CComplex> cip(coarse);
-    auto cip_ = cip.View();
-    auto  ip_ =  ip.View();
-    accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
-	coalescedWrite(cip_[sc], ip_(sc)());
-    });
-    blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
+    auto  ip_ =  ip.AcceleratorView(ViewRead);
+    blockZAXPY(fineData,ip,Basis[i],fineData);
  }
 }
 #endif
@@ -470,8 +614,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  Coordinate rdt = Tg->_rdimensions;
  Coordinate ist = Tg->_istride;
  Coordinate ost = Tg->_ostride;
-  auto t_v = To.View();
-  auto f_v = From.View();
+  auto t_v = To.AcceleratorView(ViewWrite);
+  auto f_v = From.AcceleratorView(ViewRead);
  accelerator_for(idx,Fg->lSites(),1,{
    sobj s;
    Coordinate Fcoor(nd);
@@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Transpose
 ////////////////////////////////////////////////////////////////////////////////////////////////////
+/*
 template<class vobj>
 inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
@@ -48,7 +49,8 @@ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
  });
  return ret;
 };
-    
+*/    
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Index level dependent transpose
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -341,7 +341,7 @@ class BinaryIO {
    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
    assert(ieee64||ieee32|ieee64big||ieee32big);
    assert((ieee64+ieee32+ieee64big+ieee32big)==1);
    //////////////////////////////////////////////////////////////////////////////
@@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger {
  };
 };

+// Copies a double-stored gauge site from the file object type (fobj) to the
+// scalar object type (sobj): every (row, col) colour entry of each of the
+// Nds link slots is assigned individually.
+template<class fobj,class sobj>
+struct GaugeDoubleStoredMunger{
+  void operator()(fobj &in, sobj &out) {
+    for (int slot = 0; slot < Nds; ++slot) {
+      for (int row = 0; row < Nc; ++row) {
+        for (int col = 0; col < Nc; ++col) {
+          out(slot)()(row, col) = in(slot)()(row, col);
+        }
+      }
+    }
+  }
+};
+
+// Inverse direction of GaugeDoubleStoredMunger: copies a double-stored gauge
+// site from the scalar object type (sobj) to the file object type (fobj),
+// entry by entry over the Nds link slots and the Nc x Nc colour indices.
+template <class fobj, class sobj>
+struct GaugeDoubleStoredUnmunger {
+  void operator()(sobj &in, fobj &out) {
+    for (int slot = 0; slot < Nds; ++slot) {
+      for (int row = 0; row < Nc; ++row) {
+        for (int col = 0; col < Nc; ++col) {
+          out(slot)()(row, col) = in(slot)()(row, col);
+        }
+      }
+    }
+  }
+};
+
 template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
@@ -146,7 +146,7 @@ public:
    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));

    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
@@ -0,0 +1,224 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/parallelIO/OpenQcdIO.h
+
+Copyright (C) 2015 - 2020
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// Header record found at the start of an openQCD configuration file: the four
+// lattice extents (declared in the order Nt, Nx, Ny, Nz) followed by the
+// plaquette value. The readers in this directory fill it with a raw read of
+// sizeof(OpenQcdHeader) bytes.
+// NOTE(review): that raw read assumes the in-memory layout produced by
+// GRID_SERIALIZABLE_CLASS_MEMBERS matches the on-disk layout - confirm.
+struct OpenQcdHeader : Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
+                                  int,    Nt,
+                                  int,    Nx,
+                                  int,    Ny,
+                                  int,    Nz,
+                                  double, plaq);
+};
+
+class OpenQcdIO : public BinaryIO {
+public:
+  static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6
+
+  // Reads the OpenQcdHeader stored at the beginning of `file` and transfers it
+  // into `field`: dimension[0..3] receive Nx, Ny, Nz, Nt, plaquette receives
+  // the header plaquette divided by normalisationFactor, and data_start
+  // receives the stream position right after the header.
+  // The header is validated against `grid` with assert() only.
+  // Returns field.data_start.
+  // NOTE(review): the return type is int while the caller stores the result in
+  // a uint64_t - confirm no narrowing can occur.
+  static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) {
+    OpenQcdHeader header;
+
+    {
+      // raw binary read of the header bytes straight into the struct; a file
+      // that cannot be opened or is too short trips the assert below
+      std::ifstream fin(file, std::ios::in | std::ios::binary);
+      fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
+      assert(!fin.fail());
+      field.data_start = fin.tellg();
+      fin.close();
+    }
+
+    // bring the stored plaquette to Grid's normalisation
+    header.plaq /= normalisationFactor;
+
+    // sanity check (should trigger on endian issues)
+    assert(0 < header.Nt && header.Nt <= 1024);
+    assert(0 < header.Nx && header.Nx <= 1024);
+    assert(0 < header.Ny && header.Ny <= 1024);
+    assert(0 < header.Nz && header.Nz <= 1024);
+
+    // the header lists Nt first, field.dimension is ordered x, y, z, t
+    field.dimension[0] = header.Nx;
+    field.dimension[1] = header.Ny;
+    field.dimension[2] = header.Nz;
+    field.dimension[3] = header.Nt;
+
+    std::cout << GridLogDebug << "header: " << header << std::endl;
+    std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
+    std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
+
+    // the file must describe exactly the lattice the caller's grid was built for
+    assert(grid->_ndimension == Nd);
+    for(int d = 0; d < Nd; d++)
+      assert(grid->_fdimensions[d] == field.dimension[d]);
+
+    field.plaquette = header.plaq;
+
+    return field.data_start;
+  }
+
+  // Reads an openQCD gauge configuration from `file` into Umu.
+  //
+  // Steps:
+  //  1. readHeader() fills `header` and yields the offset of the link data.
+  //  2. IOobject reads the data into `iodata` using a helper grid built by
+  //     createOpenQcdGrid (Z extent halved, X and Z swapped).
+  //  3. Every odd site is munged from file precision into `scalardata`.
+  //  4. The scalar data is vectorised into a double-stored field, which
+  //     undoDoubleStore turns into the ordinary Lorentz-indexed field Umu.
+  //  5. The plaquette of Umu is compared with the header value; a difference
+  //     of `tol` or more fails an assert.
+  //
+  // Umu.Grid() must be a GridCartesian with Nd dimensions (asserted).
+  template<class vsimd>
+  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                       FieldMetaData&                        header,
+                                       std::string                           file) {
+    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
+
+    assert(Ns == 4 and Nd == 4 and Nc == 3);
+
+    auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
+    assert(grid != nullptr); assert(grid->_ndimension == Nd);
+
+    uint64_t offset = readHeader(file, Umu.Grid(), header);
+
+    FieldMetaData clone(header);
+
+    std::string format("IEEE64"); // they always store little endian double precision
+    uint32_t    nersc_csum, scidac_csuma, scidac_csumb;
+
+    // Both helper grids are heap allocated and only needed until the munge
+    // loop below has finished; they are released right after it.
+    GridCartesian*         grid_openqcd = createOpenQcdGrid(grid);
+    GridRedBlackCartesian* grid_rb      = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
+
+    typedef DoubleStoredColourMatrixD                                              fobj;
+    typedef typename DoubleStoredGaugeField::vector_object::scalar_object          sobj;
+    typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;
+
+    word w = 0;
+
+    std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
+    std::vector<sobj> scalardata(grid->lSites());
+
+    IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
+             nersc_csum, scidac_csuma, scidac_csumb);
+
+    GridStopWatch timer;
+    timer.Start();
+
+    DoubleStoredGaugeField Umu_ds(grid);
+
+    // Instantiate the munger on the element types of iodata / scalardata so
+    // that it also matches when vsimd is not the default precision.
+    auto munge = GaugeDoubleStoredMunger<fobj, sobj>();
+
+    Coordinate ldim = grid->LocalDimensions();
+    thread_for(idx_g, grid->lSites(), {
+        Coordinate coor;
+        grid->LocalIndexToLocalCoor(idx_g, coor);
+
+        bool isOdd = grid_rb->CheckerBoard(coor) == Odd;
+
+        // only odd sites are present in the file
+        if(!isOdd) continue;
+
+        // position of this site in the file ordering (T slowest, Z fastest),
+        // halved because only every second site is stored
+        int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
+                  +  coor[Xdir] * ldim[Ydir] * ldim[Zdir]
+                  +  coor[Ydir] * ldim[Zdir]
+                  +  coor[Zdir])/2;
+
+        munge(iodata[idx_o], scalardata[idx_g]);
+    });
+
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;
+
+    // the helper grids are not referenced past this point
+    delete grid_rb;
+    delete grid_openqcd;
+
+    timer.Reset(); timer.Start();
+
+    vectorizeFromLexOrdArray(scalardata, Umu_ds);
+
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;
+
+    timer.Reset(); timer.Start();
+
+    undoDoubleStore(Umu, Umu_ds);
+
+    grid->Barrier(); timer.Stop();
+    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
+
+    GaugeStatistics(Umu, clone);
+
+    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
+
+    // clang-format off
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file
+              << " plaquette " << clone.plaquette
+              << " header " << header.plaquette
+              << " difference " << plaq_diff
+              << std::endl;
+    // clang-format on
+
+    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
+    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
+
+    if(plaq_diff >= tol)
+      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
+    assert(plaq_diff < tol);
+
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
+  }
+
+  // Placeholder: writing the openQCD format is not supported. Logs an error and
+  // terminates the program with EXIT_FAILURE; neither argument is used.
+  template<class vsimd>
+  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                        std::string                           file) {
+    std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+private:
+  // Builds a helper GridCartesian describing the openQCD file layout so that
+  // IOobject can still be used: the Z extent is halved (openqcd does rb on the
+  // z direction) and X and Z are exchanged (file order is T X Y Z from slowest
+  // to fastest). The grid is heap allocated and handed to the caller.
+  static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
+    // global extents: halve Z, then exchange X and Z
+    Coordinate globalDims = grid->GlobalDimensions();
+    globalDims[Zdir] /= 2;
+    std::swap(globalDims[Xdir], globalDims[Zdir]);
+
+    // local extents: same transformation
+    Coordinate localDims = grid->LocalDimensions();
+    localDims[Zdir] /= 2;
+    std::swap(localDims[Xdir], localDims[Zdir]);
+
+    // processor coordinate: only the X/Z exchange applies
+    Coordinate procCoor = grid->ThisProcessorCoor();
+    std::swap(procCoor[Xdir], procCoor[Zdir]);
+
+    GridCartesian* fileGrid = SpaceTimeGrid::makeFourDimGrid(globalDims, grid->_simd_layout, grid->ProcessorGrid());
+
+    // overwrite what the constructor derived with the transformed values
+    fileGrid->_ldimensions    = localDims;
+    fileGrid->_processor_coor = procCoor;
+    return fileGrid;
+  }
+
+  // Collapses the double-stored field Umu_ds into the Lorentz-indexed field
+  // Umu. For each direction mu of Umu the two slots belonging to that
+  // direction in the file ordering (T+, T-, X+, X-, Y+, Y-, Z+, Z-) are
+  // combined: the "+" slot plus the "-" slot shifted by +1 along mu.
+  template<class vsimd>
+  static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
+                                     Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
+    conformable(Umu.Grid(), Umu_ds.Grid());
+    Lattice<iColourMatrix<vsimd>> link(Umu.Grid());
+
+    for(int mu = 0; mu < Nd; ++mu) {
+      // the file puts T first, so direction mu sits one pair further on (cyclically)
+      const int plusSlot  = 2 * ((mu + 1) % Nd);
+      const int minusSlot = plusSlot + 1;
+
+      link = PeekIndex<LorentzIndex>(Umu_ds, plusSlot)
+           + Cshift(PeekIndex<LorentzIndex>(Umu_ds, minusSlot), mu, +1);
+      PokeIndex<LorentzIndex>(Umu, link, mu);
+    }
+  }
+};
+
+NAMESPACE_END(Grid);
@@ -0,0 +1,281 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h
+
+Copyright (C) 2015 - 2020
+
+Author: Daniel Richtmann <daniel.richtmann@ur.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <ios>
+#include <iostream>
+#include <limits>
+#include <iomanip>
+#include <mpi.h>
+#include <ostream>
+#include <string>
+
+#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;}
+#define CHECK_VAR(a)   { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; }
+// #undef CHECK
+// #define CHECK
+
+NAMESPACE_BEGIN(Grid);
+
+class ParRdr {
+private:
+  bool const swap;
+
+  MPI_Status status;
+  MPI_File   fp;
+
+  int err;
+
+  MPI_Datatype oddSiteType;
+  MPI_Datatype fileViewType;
+
+  GridBase* grid;
+
+public:
+  // Opens `filename` read-only on communicator `comm` and remembers `gridPtr`
+  // for later layout queries. A failed open trips an assert.
+  ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr)
+    : swap(false)
+    , grid(gridPtr) {
+    err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
+    assert(err == MPI_SUCCESS);
+  }
+
+  // Closes the file handle opened by the constructor; the return code of
+  // MPI_File_close is not checked.
+  virtual ~ParRdr() { MPI_File_close(&fp); }
+
+  // Prints the error class and message belonging to the MPI return code `err`
+  // to std::cerr, prefixed with `func`. It prints unconditionally, i.e. also
+  // when `err` is MPI_SUCCESS; callers decide whether to invoke it.
+  // The message buffer is a function-local static.
+  inline void errInfo(int const err, std::string const& func) {
+    static char estring[MPI_MAX_ERROR_STRING];
+    int         eclass = -1, len = 0;
+    MPI_Error_class(err, &eclass);
+    MPI_Error_string(err, estring, &len);
+    std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
+  }
+
+  // Reads the OpenQcdHeader from the start of the file (collectively, via
+  // readBlock) and transfers it into `field`: dimension[0..3] receive
+  // Nx, Ny, Nz, Nt, plaquette receives the normalised header plaquette and
+  // data_start is set to sizeof(OpenQcdHeader).
+  // The header is validated against the grid with assert() only.
+  // Returns field.data_start.
+  int readHeader(FieldMetaData& field) {
+    assert((grid->_ndimension == Nd) && (Nd == 4));
+    assert(Nc == 3);
+
+    OpenQcdHeader header;
+
+    readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR);
+
+    // same normalisation as OpenQcdIO::readHeader (equals Nc, asserted to be 3 above)
+    header.plaq /= OpenQcdIO::normalisationFactor;
+
+    // sanity check (should trigger on endian issues) TODO remove?
+    assert(0 < header.Nt && header.Nt <= 1024);
+    assert(0 < header.Nx && header.Nx <= 1024);
+    assert(0 < header.Ny && header.Ny <= 1024);
+    assert(0 < header.Nz && header.Nz <= 1024);
+
+    // the header lists Nt first, field.dimension is ordered x, y, z, t
+    field.dimension[0] = header.Nx;
+    field.dimension[1] = header.Ny;
+    field.dimension[2] = header.Nz;
+    field.dimension[3] = header.Nt;
+
+    for(int d = 0; d < Nd; d++)
+      assert(grid->FullDimensions()[d] == field.dimension[d]);
+
+    field.plaquette = header.plaq;
+
+    // link data starts directly behind the header
+    field.data_start = sizeof(OpenQcdHeader);
+
+    return field.data_start;
+  }
+
+  // Collective read of `nbytes` elements of type `datatype` from offset `pos`
+  // (in units of the current file view) into `dest`.
+  // Despite its name, `nbytes` is an element count: it is passed to MPI as the
+  // count argument and compared with the result of MPI_Get_count.
+  // Any failure or short read trips an assert; the MPI error text is printed
+  // to std::cerr only when the read actually failed.
+  void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
+    // MPI takes the element count as an int; refuse counts that would be truncated
+    assert(nbytes <= (uint64_t)std::numeric_limits<int>::max());
+
+    err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
+    if(err != MPI_SUCCESS) errInfo(err, "MPI_File_read_at_all");
+    assert(err == MPI_SUCCESS);
+
+    // verify that the full request was satisfied
+    int read = -1;
+    MPI_Get_count(&status, datatype, &read);
+    assert(nbytes == (uint64_t)read);
+  }
+
+  // Creates and commits the two MPI datatypes used by readGauge:
+  //  - oddSiteType : one stored site, a contiguous run of elem_size bytes
+  //                  (Nd directions x 2 links x Nc*Nc complex doubles);
+  //  - fileViewType: this rank's sub-block of the global array of such sites.
+  // Every MPI failure trips an assert.
+  void createTypes() {
+    constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
+
+    err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
+
+    Coordinate const L = grid->GlobalDimensions();
+    Coordinate const l = grid->LocalDimensions();
+    Coordinate const i = grid->ThisProcessorCoor();
+
+    // Array extents listed fastest-varying first (MPI_ORDER_FORTRAN below):
+    // index 2 halved, then index 1, index 0, index 3 of the grid dimensions.
+    Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
+    Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
+    // start of this rank's sub-block, derived from its processor coordinate
+    Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
+
+    err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
+  }
+
+  // Releases the datatypes created by createTypes(); the derived file view
+  // type is freed before the element type it was built from.
+  void freeTypes() {
+    err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
+  }
+
+  // Reads this rank's share of the gauge links into `domain_buff` and the
+  // header information into `meta`.
+  //
+  // The file view is set to this rank's sub-block (see createTypes), the sites
+  // are read in chunks of at most INT_MAX elements, and the view is reset to a
+  // plain byte view afterwards. `domain_buff` is resized to Nd * lSites colour
+  // matrices. MPI error text is printed only when a call actually failed.
+  // Always returns true; every failure trips an assert instead.
+  bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
+    auto hdr_offset = readHeader(meta);
+    CHECK
+    createTypes();
+    err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL);
+    if(err != MPI_SUCCESS) errInfo(err, "MPI_File_set_view0");
+    assert(err == MPI_SUCCESS);
+    CHECK
+    int const domainSites = grid->lSites();
+    domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
+
+    // the actual READ
+    constexpr uint64_t cm_size   = 2 * Nc * Nc * sizeof(double);    // 2_complex
+    constexpr uint64_t os_size   = Nd * 2 * cm_size;                // 2_fwdbwd
+    constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int addressable elems: floor is fine
+    uint64_t const     n_os      = domainSites / 2;
+
+    for(uint64_t os_idx = 0; os_idx < n_os;) {
+      // number of stored sites in this chunk, capped at what an int count can express
+      uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
+      // first colour matrix belonging to stored site os_idx
+      uint64_t const cm      = os_idx * Nd * 2;
+      readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
+      os_idx += read_os;
+    }
+
+    CHECK
+    // restore a plain byte view of the file
+    err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
+    if(err != MPI_SUCCESS) errInfo(err, "MPI_File_set_view1");
+    assert(err == MPI_SUCCESS);
+    freeTypes();
+
+    std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
+    return true;
+  }
+};
+
+class OpenQcdIOChromaReference : public BinaryIO {
+public:
+  // Reference reader for openQCD configurations built on ParRdr (MPI-IO).
+  //
+  // Reads the links of `file` into `iodata`, copies them into per-site
+  // double-stored scalar objects, vectorises those into a lattice and
+  // redistributes it into Umu. `header` is filled by the reader. Finally the
+  // plaquette of Umu is compared with the header value; a difference of `tol`
+  // or more fails an assert.
+  // NOTE(review): copyToLatticeObject takes std::vector<DoubleStoredColourMatrix>,
+  // while Umu_ds_scalar is built from vsimd::scalar_type - presumably these only
+  // coincide for the default precision; confirm.
+  template<class vsimd>
+  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
+                                       Grid::FieldMetaData&                  header,
+                                       std::string                           file) {
+    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
+
+    assert(Ns == 4 and Nd == 4 and Nc == 3);
+
+    auto grid = Umu.Grid();
+
+    typedef ColourMatrixD fobj;
+
+    std::vector<fobj> iodata(
+      Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
+
+    {
+      // scoped so that the file is closed as soon as the data has been read
+      ParRdr rdr(MPI_COMM_WORLD, file, grid);
+      rdr.readGauge(iodata, header);
+    } // equivalent to using binaryio
+
+    std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
+
+    copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
+
+    DoubledGaugeField Umu_ds(grid);
+
+    vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
+
+    redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
+
+    // copy taken after the reader has filled `header`
+    FieldMetaData clone(header);
+
+    GaugeStatistics(Umu, clone);
+
+    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
+
+    // clang-format off
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file
+              << " plaquette " << clone.plaquette
+              << " header " << header.plaquette
+              << " difference " << plaq_diff
+              << std::endl;
+    // clang-format on
+
+    // tolerance depends on the precision of vsimd and grows with the rank count
+    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
+    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
+
+    if(plaq_diff >= tol)
+      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
+    assert(plaq_diff < tol);
+
+    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
+  }
+
+private:
+  // Collapses the double-stored field Umu_ds into the Lorentz-indexed field
+  // Umu. Direction mu of Umu is built from slot pair (2*mu_o, 2*mu_o+1) of
+  // Umu_ds with mu_o = (mu + 1) % Nd: the first slot plus the second slot
+  // shifted by +1 along mu. The only caller asserts Nd == 4, which gives the
+  // slot pairs (2,3), (4,5), (6,7), (0,1) for mu = 0..3.
+  template<class vsimd>
+  static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
+                                  Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
+    Grid::conformable(Umu.Grid(), Umu_ds.Grid());
+    Lattice<iColourMatrix<vsimd>> link(Umu.Grid());
+
+    for(int mu = 0; mu < Nd; ++mu) {
+      const int firstSlot = 2 * ((mu + 1) % Nd);
+
+      link = PeekIndex<LorentzIndex>(Umu_ds, firstSlot)
+           + Cshift(PeekIndex<LorentzIndex>(Umu_ds, firstSlot + 1), mu, +1);
+      PokeIndex<LorentzIndex>(Umu, link, mu);
+    }
+  }
+
+  // Scatters the colour matrices of `node_buff` (file order) into the per-site
+  // objects `u_fb` (lexicographic local index).
+  //
+  // Sites are visited with t outermost and z innermost. Sites whose local
+  // coordinate sum is even are skipped; every other site consumes the next
+  // 2*Nd consecutive matrices of `node_buff`. The asserts check that the
+  // buffer holds Nd * lSites matrices and that all of them were consumed.
+  // NOTE(review): the parity test uses local coordinates only - presumably
+  // this relies on each rank's sub-lattice starting at an even global offset;
+  // confirm.
+  static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
+                                         std::vector<ColourMatrixD> const&      node_buff,
+                                         GridBase*                              grid) {
+    assert(node_buff.size() == Nd * grid->lSites());
+
+    Coordinate const& l = grid->LocalDimensions();
+
+    // x, y, z, t alias the entries of coord, so the loops below update coord
+    // directly and it can be handed to IndexFromCoor as is
+    Coordinate coord(Nd);
+    int&       x = coord[0];
+    int&       y = coord[1];
+    int&       z = coord[2];
+    int&       t = coord[3];
+
+    int buff_idx = 0;
+    for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering
+      for(x = 0; x < l[0]; ++x)
+        for(y = 0; y < l[1]; ++y)
+          for(z = 0; z < l[2]; ++z) {
+            // sites with an even coordinate sum have no entry in the buffer
+            if((t + z + y + x) % 2 == 0) continue;
+
+            int local_idx;
+            Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());
+            for(int mu = 0; mu < 2 * Nd; ++mu)
+              for(int c1 = 0; c1 < Nc; ++c1) {
+                for(int c2 = 0; c2 < Nc; ++c2) {
+                  u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
+                }
+              }
+            // advance to the matrices of the next stored site
+            buff_idx += 2 * Nd;
+          }
+
+    assert(node_buff.size() == buff_idx);
+  }
+};
+
+NAMESPACE_END(Grid);
@@ -95,7 +95,8 @@ inline uint64_t cyclecount(void){
 }
 #elif defined __x86_64__
 inline uint64_t cyclecount(void){ 
-  return __rdtsc();
+  uint64_t ret = __rdtsc();
+  return (uint64_t)ret;
 }
 #else

@@ -110,15 +110,15 @@ public:
 #endif
    accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
  }
-  GridTime Elapsed(void) {
+  GridTime Elapsed(void) const {
    assert(running == false);
    return std::chrono::duration_cast<GridTime>( accumulator );
  }
-  uint64_t useconds(void){
+  uint64_t useconds(void) const {
    assert(running == false);
    return (uint64_t) accumulator.count();
  }
-  bool isRunning(void){
+  bool isRunning(void) const {
    return running;
  }
 };
@@ -133,23 +133,23 @@ typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
 typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
 typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;

-    // SpinColourSpinColour matrix
-    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
-    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
-    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+// SpinColourSpinColour matrix
+typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;

-    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
-    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
-    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
+typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;

-    // SpinColourSpinColour matrix
-    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
-    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
-    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+// SpinColourSpinColour matrix
+typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;

-    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
-    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
-    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
+typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;

 // LorentzColour
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
@@ -443,16 +443,16 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 //////////////////////////////////////////////
 // Fermion <-> propagator assignements
 //////////////////////////////////////////////
-    //template <class Prop, class Ferm>
-    template <class Fimpl>
-      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
+//template <class Prop, class Ferm>
+template <class Fimpl>
+void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
      auto fj  = peekSpin(f, j);
            
-            for(int i = 0; i < Fimpl::Dimension; ++i)
+      for(int i = 0; i < Fimpl::Dimension; ++i)
 	{
 	  pokeColour(pjs, peekColour(fj, i), i, c);
 	}
@@ -460,16 +460,16 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
    }
 }
    
-    //template <class Prop, class Ferm>
-    template <class Fimpl>
-      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
+//template <class Prop, class Ferm>
+template <class Fimpl>
+void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
      auto fj  = peekSpin(f, j);
            
-            for(int i = 0; i < Fimpl::Dimension; ++i)
+      for(int i = 0; i < Fimpl::Dimension; ++i)
 	{
 	  pokeColour(fj, peekColour(pjs, i, c), i);
 	}
@@ -40,8 +40,8 @@ public:
 public:

  // override multiply
-  virtual RealD  M    (const FermionField &in, FermionField &out);
-  virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  virtual void   M    (const FermionField &in, FermionField &out);
+  virtual void   Mdag (const FermionField &in, FermionField &out);

  // half checkerboard operations
  virtual void   Meooe       (const FermionField &in, FermionField &out);
@@ -141,7 +141,33 @@ public:
  Vector<iSinglet<Simd> >  MatpInvDag;
  Vector<iSinglet<Simd> >  MatmInvDag;

+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities
+  ///////////////////////////////////////////////////////////////
+
+  // Virtual can't template
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+				PropagatorField &q_in_2,
+				PropagatorField &q_out,
+				PropagatorField &phys_src,
+				Current curr_type, 
+				unsigned int mu);
+
+   void SeqConservedCurrent(PropagatorField &q_in,
+			   PropagatorField &q_out,
+			   PropagatorField &phys_src,
+			   Current curr_type,
+			   unsigned int mu,
+			   unsigned int tmin,
+			   unsigned int tmax,
+			   ComplexField &lattice_cmplx);
+
+  void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
+  void ContractJ5q(FermionField &q_in,ComplexField &J5q);
+
+  ///////////////////////////////////////////////////////////////
  // Constructors
+  ///////////////////////////////////////////////////////////////
  CayleyFermion5D(GaugeField &_Umu,
 		  GridCartesian         &FiveDimGrid,
 		  GridRedBlackCartesian &FiveDimRedBlackGrid,
@@ -41,8 +41,8 @@ public:
 public:

  // override multiply
-  virtual RealD  M    (const FermionField &in, FermionField &out);
-  virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  virtual void   M    (const FermionField &in, FermionField &out);
+  virtual void   Mdag (const FermionField &in, FermionField &out);

  // half checkerboard operaions
  virtual void   Meooe       (const FermionField &in, FermionField &out);
@@ -53,8 +53,8 @@ public:
  virtual void  DtildeInv  (const FermionField& in, FermionField& out);

  // override multiply
-  virtual RealD M          (const FermionField& in, FermionField& out);
-  virtual RealD Mdag       (const FermionField& in, FermionField& out);
+  virtual void  M          (const FermionField& in, FermionField& out);
+  virtual void  Mdag       (const FermionField& in, FermionField& out);

  // half checkerboard operations
  virtual void  Mooee      (const FermionField& in, FermionField& out);
@@ -58,8 +58,8 @@ public:
  virtual GridBase *GaugeRedBlackGrid(void)   =0;

  // override multiply
-  virtual RealD  M    (const FermionField &in, FermionField &out)=0;
-  virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
+  virtual void  M    (const FermionField &in, FermionField &out)=0;
+  virtual void  Mdag (const FermionField &in, FermionField &out)=0;

  // half checkerboard operaions
  virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
@@ -86,15 +86,14 @@ public:
  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;

-
  virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
  virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
  virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac


-      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
+  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

-      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) 
+  virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) 
      {
 	FFT theFFT((GridCartesian *) in.Grid());

@@ -148,15 +147,19 @@ public:
  virtual void ContractConservedCurrent(PropagatorField &q_in_1,
 					PropagatorField &q_in_2,
 					PropagatorField &q_out,
+					PropagatorField &phys_src,
 					Current curr_type,
-					unsigned int mu)=0;
+					unsigned int mu)
+  {assert(0);};
  virtual void SeqConservedCurrent(PropagatorField &q_in, 
 				   PropagatorField &q_out,
+				   PropagatorField &phys_src,
 				   Current curr_type,
 				   unsigned int mu,
 				   unsigned int tmin, 
 				   unsigned int tmax,
-				   ComplexField &lattice_cmplx)=0;
+				   ComplexField &lattice_cmplx)
+  {assert(0);};

      // Only reimplemented in Wilson5D 
      // Default to just a zero correlation function
@@ -38,6 +38,7 @@ public:
 static const bool isFundamental = Representation::isFundamental;
 static const int Nhcs = Options::Nhcs;
 static const bool LsVectorised=false;
+ static const bool isGparity=true;

 typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
 INHERIT_GIMPL_TYPES(Gimpl);
@@ -46,7 +47,7 @@ public:
 typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
      
 template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>,   Ngp>;
- template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplPropagator        = iMatrix<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
 template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>,  Ngp>;
 template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
 template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
@@ -80,6 +81,7 @@ public:
  {
    assert(0);
  } 
+
  template<class _Spinor>
  static accelerator_inline void multLink(_Spinor &phi, 
 					  const SiteDoubledGaugeField &U,
@@ -191,6 +193,16 @@ public:
 #endif   
  }

+
+  template<class _SpinorField>
+  inline void multLinkField(_SpinorField & out, // Field-wide link multiply: not implemented in this Impl
+			    const DoubledGaugeField &Umu,
+			    const _SpinorField & phi,
+			    int mu)
+  {
+    assert(0); // Always fails when assertions are enabled; none of the arguments is read or written
+  }
+
  template <class ref>
  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
  {
@@ -71,8 +71,8 @@ public:
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
  //////////////////////////////////////////////////////////////////
-  RealD M(const FermionField &in, FermionField &out);
-  RealD Mdag(const FermionField &in, FermionField &out);
+  void M(const FermionField &in, FermionField &out);
+  void Mdag(const FermionField &in, FermionField &out);

  /////////////////////////////////////////////////////////
  // half checkerboard operations
@@ -185,10 +185,12 @@ public:
  void ContractConservedCurrent(PropagatorField &q_in_1,
                                PropagatorField &q_in_2,
                                PropagatorField &q_out,
+                                PropagatorField &src,
                                Current curr_type,
                                unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
+                           PropagatorField &srct,
                           Current curr_type,
                           unsigned int mu, 
                           unsigned int tmin,
@@ -1,4 +1,3 @@
-
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -74,8 +73,8 @@ public:
  GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}

  // full checkerboard operations; leave unimplemented as abstract for now
-  RealD  M    (const FermionField &in, FermionField &out);
-  RealD  Mdag (const FermionField &in, FermionField &out);
+  void  M    (const FermionField &in, FermionField &out);
+  void  Mdag (const FermionField &in, FermionField &out);

  // half checkerboard operations
  void   Meooe       (const FermionField &in, FermionField &out);
@@ -217,15 +216,17 @@ public:
  void ContractConservedCurrent(PropagatorField &q_in_1,
 				PropagatorField &q_in_2,
 				PropagatorField &q_out,
+				PropagatorField &src,
 				Current curr_type,
 				unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
 			   PropagatorField &q_out,
+			   PropagatorField &src,
 			   Current curr_type,
 			   unsigned int mu, 
 			   unsigned int tmin,
-                             unsigned int tmax,
-                 	     ComplexField &lattice_cmplx);
+			   unsigned int tmax,
+			   ComplexField &lattice_cmplx);
 };

 NAMESPACE_END(Grid);
@@ -40,6 +40,11 @@ inline void convert(const Fieldi &from,Fieldo &to)
  to=from;
 }

+struct MADWFinnerIterCallbackBase{ // Callback interface: operator() receives the current residual as a RealD
+  virtual void operator()(const RealD current_resid){} // Default does nothing; override in a derived class to act on the residual
+  virtual ~MADWFinnerIterCallbackBase(){} // Virtual destructor so derived callbacks can be destroyed through a base pointer
+};
+
 template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser> 
 class MADWF 
 {
@@ -56,24 +61,30 @@ class MADWF

  RealD target_resid;
  int   maxiter;
- public:

+  // If non-NULL, (*callback)(resid) is invoked at the end of every inner iteration. This allows, for example, adjusting the inner
+  // tolerance to speed up subsequent iterations.
+  MADWFinnerIterCallbackBase* callback;
+  
+ public:
  MADWF(Matrixo &_Mato,
-	Matrixi &_Mati, 
-	PVinverter &_PauliVillarsSolvero, 
+	Matrixi &_Mati,
+	PVinverter &_PauliVillarsSolvero,
 	SchurSolver &_SchurSolveri,
 	Guesser & _Guesseri,
 	RealD resid,
-	int _maxiter) :
+	int _maxiter,
+	MADWFinnerIterCallbackBase* _callback = NULL) :

  Mato(_Mato),Mati(_Mati),
    SchurSolveri(_SchurSolveri),
-    PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri)
-  {   
-    target_resid=resid;
-    maxiter     =_maxiter; 
-  };
-
+    PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri),
+    callback(_callback)
+    {
+      target_resid=resid;
+      maxiter     =_maxiter;
+    };
+   
  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
  {
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
@@ -177,6 +188,8 @@ class MADWF
       std::cout << GridLogMessage << "Residual " << i << ": " << resid  << std::endl;
       std::cout << GridLogMessage << "***************************************" <<std::endl;

+       if(callback != NULL) (*callback)(resid);       
+       
       if (resid < target_resid) {
 	 return;
       }
@@ -56,8 +56,8 @@ public:
  virtual void  DtildeInv        (const FermionField& in, FermionField& out);

  // override multiply
-  virtual RealD M                (const FermionField& in, FermionField& out);
-  virtual RealD Mdag             (const FermionField& in, FermionField& out);
+  virtual void  M                (const FermionField& in, FermionField& out);
+  virtual void  Mdag             (const FermionField& in, FermionField& out);

  // half checkerboard operations
  virtual void  Mooee            (const FermionField& in, FermionField& out);
@@ -59,7 +59,7 @@ public:
  {
    RealD eps = 1.0;

-    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
+    //    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
    Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
    assert(zdata->n==this->Ls);
 	
@@ -47,8 +47,8 @@ public:
  void   M_internal(const FermionField &in, FermionField &out,int dag);

  // override multiply
-  virtual RealD  M    (const FermionField &in, FermionField &out);
-  virtual RealD  Mdag (const FermionField &in, FermionField &out);
+  virtual void   M    (const FermionField &in, FermionField &out);
+  virtual void   Mdag (const FermionField &in, FermionField &out);

  // half checkerboard operaions
  virtual void   Meooe       (const FermionField &in, FermionField &out);
@@ -109,9 +109,8 @@ public:
    ImportGauge(_Umu);
  }

-  virtual RealD M(const FermionField &in, FermionField &out);
-  virtual RealD Mdag(const FermionField &in, FermionField &out);
-
+  virtual void M(const FermionField &in, FermionField &out);
+  virtual void Mdag(const FermionField &in, FermionField &out);
  virtual void Mooee(const FermionField &in, FermionField &out);
  virtual void MooeeDag(const FermionField &in, FermionField &out);
  virtual void MooeeInv(const FermionField &in, FermionField &out);
@@ -78,8 +78,8 @@ public:
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
  //////////////////////////////////////////////////////////////////
-  virtual RealD M(const FermionField &in, FermionField &out);
-  virtual RealD Mdag(const FermionField &in, FermionField &out);
+  virtual void  M(const FermionField &in, FermionField &out);
+  virtual void  Mdag(const FermionField &in, FermionField &out);

  /////////////////////////////////////////////////////////
  // half checkerboard operations
@@ -179,15 +179,17 @@ public:
  void ContractConservedCurrent(PropagatorField &q_in_1,
                                PropagatorField &q_in_2,
                                PropagatorField &q_out,
+                                PropagatorField &phys_src,
                                Current curr_type,
                                unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
+                           PropagatorField &phys_src,
                           Current curr_type,
                           unsigned int mu, 
                           unsigned int tmin,
-                             unsigned int tmax,
-			     ComplexField &lattice_cmplx);
+			   unsigned int tmax,
+			   ComplexField &lattice_cmplx);
 };

 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
@@ -1,4 +1,3 @@
-
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -99,8 +98,8 @@ public:
  GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}

  // full checkerboard operations; leave unimplemented as abstract for now
-  virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
-  virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+  virtual void   M    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Mdag (const FermionField &in, FermionField &out){assert(0);};

  // half checkerboard operations; leave unimplemented as abstract for now
  virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
@@ -217,25 +216,7 @@ public:
    
  // Comms buffer
  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
-    
-  ///////////////////////////////////////////////////////////////
-  // Conserved current utilities
-  ///////////////////////////////////////////////////////////////
-  void ContractConservedCurrent(PropagatorField &q_in_1,
-				PropagatorField &q_in_2,
-				PropagatorField &q_out,
-				Current curr_type, 
-				unsigned int mu);
-  void SeqConservedCurrent(PropagatorField &q_in,
-			   PropagatorField &q_out,
-			   Current curr_type,
-			   unsigned int mu,
-			   unsigned int tmin,
-			   unsigned int tmax,
-			   ComplexField &lattice_cmplx);

-  void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
-  void ContractJ5q(FermionField &q_in,ComplexField &J5q);

 };

@@ -41,6 +41,7 @@ public:
  static const int Dimension = Representation::Dimension;
  static const bool isFundamental = Representation::isFundamental;
  static const bool LsVectorised=false;
+  static const bool isGparity=false;
  static const int Nhcs = Options::Nhcs;

  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@@ -98,8 +99,21 @@ public:
  {
    multLink(phi,U,chi,mu);
  }
-    
-      
+
+  // Whole-field counterpart of multLink: calls multLink once for every outer site of out's grid.
+  template<class _SpinorField>
+  inline void multLinkField(_SpinorField & out,
+			    const DoubledGaugeField &Umu,
+			    const _SpinorField & phi, int mu)
+  {
+    auto result_view = out.View();
+    auto source_view = phi.View();
+    auto gauge_view  = Umu.View();
+    thread_for(site,out.Grid()->oSites(),{
+	multLink(result_view[site],gauge_view[site],source_view[site],mu);
+    });
+  }
+					   
  template <class ref>
  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
  {
@@ -66,41 +66,6 @@ public:
  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);

-  //////////////////////////////////////////////////////////////////////////////
-  // Utilities for inserting Wilson conserved current.
-  //////////////////////////////////////////////////////////////////////////////
-  static void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
-                                       const SitePropagator &q_in_2,
-                                       SitePropagator &q_out,
-                                       DoubledGaugeFieldView &U,
-                                       unsigned int sU,
-                                       unsigned int mu,
-                                       bool switch_sign = false);
-
-  static void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
-                                       const SitePropagator &q_in_2,
-                                       SitePropagator &q_out,
-                                       DoubledGaugeFieldView &U,
-                                       unsigned int sU,
-                                       unsigned int mu,
-                                       bool switch_sign = false);
-
-  static void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
-                                  SitePropagator &q_out,
-                                  DoubledGaugeFieldView &U,
-                                  unsigned int sU,
-                                  unsigned int mu,
-                                  vPredicate t_mask,
-                                  bool switch_sign = false);
-
-  static void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
-                                  SitePropagator &q_out,
-                                  DoubledGaugeFieldView &U,
-                                  unsigned int sU,
-                                  unsigned int mu,
-                                  vPredicate t_mask,
-                                  bool switch_sign = false);
-
 private:

  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
@@ -120,7 +120,8 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
    }
  }
  
-  virtual RealD M(const FermionField &in, FermionField &out) {
+  virtual void M(const FermionField &in, FermionField &out) 
+  {
    out.Checkerboard() = in.Checkerboard();
    this->Dhop(in, out, DaggerNo);
    FermionField tmp(out.Grid());
@@ -129,11 +130,12 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
      ComplexD b(0.0,this->mu[s]);
      axpbg5y_ssp(tmp,a,in,b,in,s,s);
    }
-    return axpy_norm(out, 1.0, tmp, out);
+    axpy(out, 1.0, tmp, out);
  }
  
  // needed for fast PV
-  void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
+  void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) 
+  {
    assert(_mass.size() == _mu.size());
    assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
    this->mass = _mass;
@@ -323,7 +323,7 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField
 }

 template<class Impl>
-RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
  FermionField Din(psi.Grid());
  
@@ -335,11 +335,10 @@ RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  axpby(chi,1.0,1.0,chi,psi); 
  
  M5D(psi,chi);
-  return(norm2(chi));
 }

 template<class Impl>
-RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
+void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
 {
  // Under adjoint
  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
@@ -354,7 +353,6 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  M5Ddag(psi,chi);
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby (chi,1.0,1.0,chi,psi); 
-  return norm2(chi);
 }

 // half checkerboard operations
@@ -588,6 +586,356 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
 }

+
+// Fills J5q with localInnerProduct(p,p), where p combines slices Ls/2-1 and Ls/2 extracted from q_in.
+template <class Impl>
+void CayleyFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
+{
+  conformable(this->GaugeGrid(), J5q.Grid());     // J5q must live on the gauge grid
+  conformable(q_in.Grid(), this->FermionGrid());  // q_in must live on the fermion grid
+  Gamma G5(Gamma::Algebra::Gamma5);
+  int Ls = this->Ls;
+  // Temporaries on the gauge grid (4d fields)
+  FermionField p_plus (this->GaugeGrid());
+  FermionField p_minus(this->GaugeGrid());
+  FermionField p(this->GaugeGrid());
+
+  ExtractSlice(p_plus , q_in, Ls/2-1 , 0);  // slice index Ls/2-1
+  ExtractSlice(p_minus, q_in, Ls/2   , 0);  // slice index Ls/2
+  p_plus = p_plus + G5*p_plus;              // (1+G5) acting on the Ls/2-1 slice
+  p_minus= p_minus - G5*p_minus;            // (1-G5) acting on the Ls/2 slice
+  p=0.5*(p_plus+p_minus);
+  J5q = localInnerProduct(p,p);
+}
+
+// Propagator overload: fills J5q with localInnerProduct(p,p), p built from slices Ls/2-1 and Ls/2 of q_in.
+template <class Impl>
+void CayleyFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
+{
+  conformable(this->GaugeGrid(), J5q.Grid());     // J5q must live on the gauge grid
+  conformable(q_in.Grid(), this->FermionGrid());  // q_in must live on the fermion grid
+  Gamma G5(Gamma::Algebra::Gamma5);
+  int Ls = this->Ls;
+  // Temporaries on the gauge grid (4d fields)
+  PropagatorField p_plus (this->GaugeGrid());
+  PropagatorField p_minus(this->GaugeGrid());
+  PropagatorField p(this->GaugeGrid());
+
+  ExtractSlice(p_plus , q_in, Ls/2-1 , 0);  // slice index Ls/2-1
+  ExtractSlice(p_minus, q_in, Ls/2   , 0);  // slice index Ls/2
+  p_plus = p_plus + G5*p_plus;              // (1+G5) acting on the Ls/2-1 slice
+  p_minus= p_minus - G5*p_minus;            // (1-G5) acting on the Ls/2 slice
+  p=0.5*(p_plus+p_minus);
+  J5q = localInnerProduct(p,p);
+}
+
+#define Pp(Q) (0.5*(Q+g5*Q))
+#define Pm(Q) (0.5*(Q-g5*Q))
+#define Q_4d(Q) (Pm((Q)[0]) + Pp((Q)[Ls-1]))
+#define TopRowWithSource(Q) (phys_src + (1.0-mass)*Q_4d(Q))
+
+// Extracts Ls slices from q_in_1 and q_in_2 and accumulates their contraction for direction mu into q_out
+// (zeroed first). phys_src enters through the TopRowWithSource macro; for Current::Axial the s<Ls/2 terms are
+// added with sign -1. The body is compiled out when GRID_NVCC is defined, leaving q_out untouched.
+template <class Impl> 
+void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
+						      PropagatorField &q_in_2,
+						      PropagatorField &q_out,
+						      PropagatorField &phys_src,
+						      Current curr_type, unsigned int mu)
+{
+#ifndef GRID_NVCC
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT,
+    Gamma::Algebra::Gamma5
+  };
+
+  auto UGrid= this->GaugeGrid();
+  assert(mu < sizeof(Gmu)/sizeof(Gmu[0])); // Gmu[mu] is read below: reject an out-of-range mu
+  RealD sgn=1.0;
+  if ( curr_type == Current::Axial ) sgn = -1.0;
+
+  int Ls = this->Ls;
+
+  std::vector<PropagatorField> L_Q(Ls,UGrid); 
+  std::vector<PropagatorField> R_Q(Ls,UGrid); 
+  for(int s=0;s<Ls;s++){
+    ExtractSlice(L_Q[s], q_in_1, s , 0);
+    ExtractSlice(R_Q[s], q_in_2, s , 0);
+  }
+
+  Gamma g5(Gamma::Algebra::Gamma5); // referenced by the Pp/Pm macros
+  PropagatorField C(UGrid); 
+  PropagatorField p5d(UGrid); 
+  PropagatorField us_p5d(UGrid); 
+  PropagatorField gp5d(UGrid); 
+  PropagatorField gus_p5d(UGrid); 
+
+  PropagatorField L_TmLsGq0(UGrid); 
+  PropagatorField L_TmLsTmp(UGrid);
+  PropagatorField R_TmLsGq0(UGrid); 
+  PropagatorField R_TmLsTmp(UGrid);
+  {
+    PropagatorField TermA(UGrid);
+    PropagatorField TermB(UGrid);
+    PropagatorField TermC(UGrid);
+    PropagatorField TermD(UGrid);
+    TermA = (Pp(Q_4d(L_Q)));
+    TermB = (Pm(Q_4d(L_Q)));
+    TermC = (Pm(TopRowWithSource(L_Q)));
+    TermD = (Pp(TopRowWithSource(L_Q)));
+
+    L_TmLsGq0 = (TermD - TermA + TermB);
+    L_TmLsTmp = (TermC - TermB + TermA);
+
+    TermA = (Pp(Q_4d(R_Q)));
+    TermB = (Pm(Q_4d(R_Q)));
+    TermC = (Pm(TopRowWithSource(R_Q)));
+    TermD = (Pp(TopRowWithSource(R_Q)));
+
+    R_TmLsGq0 = (TermD - TermA + TermB);
+    R_TmLsTmp = (TermC - TermB + TermA);
+  }
+
+  std::vector<PropagatorField> R_TmLsGq(Ls,UGrid);
+  std::vector<PropagatorField> L_TmLsGq(Ls,UGrid);
+  for(int s=0;s<Ls;s++){
+    R_TmLsGq[s] = (Pm((R_Q)[(s)]) + Pp((R_Q)[((s)-1+Ls)%Ls]));
+    L_TmLsGq[s] = (Pm((L_Q)[(s)]) + Pp((L_Q)[((s)-1+Ls)%Ls]));
+  }
+
+  Gamma gmu=Gamma(Gmu[mu]);
+
+  q_out = Zero();
+  PropagatorField tmp(UGrid); 
+  for(int s=0;s<Ls;s++){
+
+    int sp = (s+1)%Ls;     // next slice index, wrapping at Ls
+    int sr = Ls-1-s;       // slice index counted from the other end
+    int srp= (sr+1)%Ls;    // successor of sr, wrapping at Ls
+
+    // Mobius parameters
+    auto b=this->bs[s];
+    auto c=this->cs[s];
+    auto bpc = 1.0/(b+c);  // -0.5 factor in gauge links
+    if (s == 0) {
+      p5d    =(b*Pm(L_TmLsGq[Ls-1])+ c*Pp(L_TmLsGq[Ls-1]) + b*Pp(L_TmLsTmp)   + c*Pm(L_TmLsTmp     ));
+      tmp    =(b*Pm(R_TmLsGq0)     + c*Pp(R_TmLsGq0     ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
+    } else if (s == Ls-1) {
+      p5d    =(b*Pm(L_TmLsGq0)     + c*Pp(L_TmLsGq0     ) + b*Pp(L_TmLsGq[1]) + c*Pm(L_TmLsGq[1]));
+      tmp    =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp)   + c*Pm(R_TmLsTmp   ));
+    } else {
+      p5d    =(b*Pm(L_TmLsGq[sr]) + c*Pp(L_TmLsGq[sr])+ b*Pp(L_TmLsGq[srp])+ c*Pm(L_TmLsGq[srp]));
+      tmp    =(b*Pm(R_TmLsGq[s])  + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
+    }
+    tmp    = Cshift(tmp,mu,1);
+    Impl::multLinkField(us_p5d,this->Umu,tmp,mu);
+    gp5d=g5*p5d*g5;
+    gus_p5d=gmu*us_p5d;
+
+    C = bpc*(adj(gp5d)*us_p5d);
+    C-= bpc*(adj(gp5d)*gus_p5d);
+
+    if (s == 0) {
+      p5d    =(b*Pm(R_TmLsGq0)     + c*Pp(R_TmLsGq0  )    + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
+      tmp    =(b*Pm(L_TmLsGq[Ls-1])+ c*Pp(L_TmLsGq[Ls-1]) + b*Pp(L_TmLsTmp)   + c*Pm(L_TmLsTmp  ));
+    } else if (s == Ls-1) {
+      p5d    =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp)   + c*Pm(R_TmLsTmp  ));
+      tmp    =(b*Pm(L_TmLsGq0)     + c*Pp(L_TmLsGq0  )    + b*Pp(L_TmLsGq[1]) + c*Pm(L_TmLsGq[1]));
+    } else {
+      p5d    =(b*Pm(R_TmLsGq[s])  + c*Pp(R_TmLsGq[s])  + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
+      tmp    =(b*Pm(L_TmLsGq[sr]) + c*Pp(L_TmLsGq[sr]) + b*Pp(L_TmLsGq[srp])+ c*Pm(L_TmLsGq[srp]));
+    }
+    tmp    = Cshift(tmp,mu,1);
+    Impl::multLinkField(us_p5d,this->Umu,tmp,mu);
+
+    gp5d=gmu*p5d;
+    gus_p5d=g5*us_p5d*g5;
+
+    C-= bpc*(adj(gus_p5d)*gp5d);
+    C-= bpc*(adj(gus_p5d)*p5d);
+
+    if (s < Ls/2) q_out += sgn*C;
+    else          q_out +=     C;
+  }
+#endif
+}
+
+// Sequential conserved-current insertion (general Cayley case): fills every s-slice of q_out from q_in.
+// Per slice, a b/c-weighted combination of neighbouring slices of q_in (phys_src enters via TopRowWithSource)
+// is hopped forward (multiplied by ph, kept where tmin <= coordinate[Nd-1] <= tmax) and backward (window
+// shifted by tshift, which is 1 only for mu==Nd-1); Current::Axial flips the sign for s<Ls/2.
+// The live code is compiled out when GRID_NVCC is defined, in which case q_out is left untouched.
+template <class Impl>
+void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
+                                                PropagatorField &q_out,
+                                                PropagatorField &phys_src,
+                                                Current curr_type, 
+                                                unsigned int mu,
+                                                unsigned int tmin, unsigned int tmax,
+						ComplexField &ph)// Complex phase factor
+{
+  assert(mu<Nd); // mu is unsigned, so only the upper bound can fail
+
+
+#if 0
+  int tshift = (mu == Nd-1) ? 1 : 0;
+  ////////////////////////////////////////////////
+  // SHAMIR CASE 
+  ////////////////////////////////////////////////
+  int Ls = this->Ls;
+  auto UGrid= this->GaugeGrid();
+  auto FGrid= this->FermionGrid();
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+  Gamma gmu=Gamma(Gmu[mu]);
+
+  PropagatorField L_Q(UGrid); 
+  PropagatorField R_Q(UGrid); 
+
+  PropagatorField tmp(UGrid);
+  PropagatorField Utmp(UGrid);
+  LatticeInteger zz (UGrid);   zz=0.0;
+  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
+  for (int s=0;s<Ls;s++) {
+
+    RealD G_s = (curr_type == Current::Axial  ) ? ((s < Ls/2) ? -1 : 1) : 1;
+
+    ExtractSlice(R_Q, q_in, s , 0);
+
+    tmp    = Cshift(R_Q,mu,1);
+    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
+    tmp    = G_s*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
+    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time
+    tmp    = where((lcoor<=tmax),tmp,zz);
+    L_Q = tmp;
+
+    tmp    = R_Q*ph;
+    tmp    = Cshift(tmp,mu,-1);
+    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd);// Adjoint link
+    tmp    = -G_s*( Utmp + gmu*Utmp ); 
+    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    tmp    = where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
+    L_Q= L_Q+tmp;
+
+    InsertSlice(L_Q, q_out, s , 0);
+  }
+#endif
+
+#ifndef GRID_NVCC
+  int tshift = (mu == Nd-1) ? 1 : 0;
+  ////////////////////////////////////////////////
+  // GENERAL CAYLEY CASE
+  ////////////////////////////////////////////////
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT,
+    Gamma::Algebra::Gamma5
+  };
+  Gamma gmu=Gamma(Gmu[mu]);
+  Gamma g5(Gamma::Algebra::Gamma5); // referenced by the Pp/Pm macros
+
+  int Ls = this->Ls;
+  auto UGrid= this->GaugeGrid();
+
+  std::vector<PropagatorField> R_Q(Ls,UGrid); 
+  PropagatorField L_Q(UGrid); 
+  PropagatorField tmp(UGrid);
+  PropagatorField Utmp(UGrid);
+
+  LatticeInteger zz (UGrid);   zz=0.0;
+  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
+
+  for(int s=0;s<Ls;s++){
+    ExtractSlice(R_Q[s], q_in, s , 0);
+  }
+
+  PropagatorField R_TmLsGq0(UGrid); 
+  PropagatorField R_TmLsTmp(UGrid);
+  {
+    PropagatorField TermA(UGrid);
+    PropagatorField TermB(UGrid);
+    PropagatorField TermC(UGrid);
+    PropagatorField TermD(UGrid);
+
+    TermA = (Pp(Q_4d(R_Q)));
+    TermB = (Pm(Q_4d(R_Q)));
+    TermC = (Pm(TopRowWithSource(R_Q)));
+    TermD = (Pp(TopRowWithSource(R_Q)));
+
+    R_TmLsGq0 = (TermD - TermA + TermB);
+    R_TmLsTmp = (TermC - TermB + TermA);
+  }
+
+  std::vector<PropagatorField> R_TmLsGq(Ls,UGrid);
+  for(int s=0;s<Ls;s++){
+    R_TmLsGq[s] = (Pm((R_Q)[(s)]) + Pp((R_Q)[((s)-1+Ls)%Ls]));
+  }
+
+  std::vector<RealD> G_s(Ls,1.0);
+  if ( curr_type == Current::Axial ) {
+    for(int s=0;s<Ls/2;s++){
+      G_s[s] = -1.0;
+    }
+  }
+
+  for(int s=0;s<Ls;s++){
+
+    int sp = (s+1)%Ls;
+
+    // Mobius parameters
+    auto b=this->bs[s];
+    auto c=this->cs[s];
+    //    auto bpc = G_s[s]*1.0/(b+c);  // -0.5 factor in gauge links
+
+    if (s == 0) {
+      tmp    =(b*Pm(R_TmLsGq0)     + c*Pp(R_TmLsGq0     ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
+    } else if (s == Ls-1) {
+      tmp    =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp)   + c*Pm(R_TmLsTmp   ));
+    } else {
+      tmp    =(b*Pm(R_TmLsGq[s])  + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
+    }
+
+    tmp    = Cshift(tmp,mu,1);
+    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
+    tmp    = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
+    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time 
+    L_Q    = where((lcoor<=tmax),tmp,zz); // Position of current complicated
+
+    if (s == 0) {
+      tmp    =(b*Pm(R_TmLsGq0)     + c*Pp(R_TmLsGq0  )    + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
+    } else if (s == Ls-1) {
+      tmp    =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp)   + c*Pm(R_TmLsTmp  ));
+    } else {
+      tmp    =(b*Pm(R_TmLsGq[s])   + c*Pp(R_TmLsGq[s])    + b*Pp(R_TmLsGq[sp])+ c*Pm(R_TmLsGq[sp]));
+    }
+    tmp    = tmp *ph;
+    tmp    = Cshift(tmp,mu,-1);
+    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
+    tmp = -G_s[s]*( Utmp + gmu*Utmp );
+    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
+    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
+
+    InsertSlice(L_Q, q_out, s , 0);
+  }
+#endif
+}
+#undef Pp
+#undef Pm
+#undef Q_4d
+#undef TopRowWithSource
+
+
+
 #if 0
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
@@ -94,7 +94,7 @@ void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Ap


 template<class Impl>
-RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
+void ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, FermionField &chi)
 {
  int Ls = this->Ls;

@@ -116,15 +116,14 @@ RealD  ContinuedFractionFermion5D<Impl>::M           (const FermionField &psi, F
    }
    sign=-sign; 
  }
-  return norm2(chi);
 }
 template<class Impl>
-RealD  ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi)
+void ContinuedFractionFermion5D<Impl>::Mdag        (const FermionField &psi, FermionField &chi) // now void: result is left in chi, no norm is returned
 {
   // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
   // The rest of matrix is symmetric.
   // Can ignore "dag"
-  return M(psi,chi);
+  M(psi,chi); // Mdag simply forwards to M (operator treated as Hermitian, see above)
 }
 template<class Impl>
 void  ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
@@ -89,7 +89,7 @@ void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionFiel
 /*****************************************************************************************************/

 template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+void DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
 {
  FermionField Din(psi.Grid());

@@ -97,11 +97,10 @@ RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
  this->DW(Din, chi, DaggerNo);
  axpby(chi, 1.0, 1.0, chi, psi);
  this->M5D(psi, chi);
-  return(norm2(chi));
 }

 template<class Impl>
-RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+void DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
 {
  FermionField Din(psi.Grid());

@@ -109,7 +108,6 @@ RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& c
  this->MeooeDag5D(Din, chi);
  this->M5Ddag(psi, chi);
  axpby(chi, 1.0, 1.0, chi, psi);
-  return(norm2(chi));
 }

 /********************************************************************
@@ -548,21 +548,24 @@ void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vect
  assert(0);
 }
 template <class Impl>
-RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) // now void: result is written to out, no norm is returned
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerNo);
-  return axpy_norm(out, mass, in, out);
+  axpy(out, mass, in, out); // replaces axpy_norm; presumably out = mass*in + out (confirm axpy argument order)
 }

 template <class Impl>
-RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) // now void: result is written to out, no norm is returned
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerYes);
-  return axpy_norm(out, mass, in, out);
+  axpy(out, mass, in, out); // replaces axpy_norm; presumably out = mass*in + out (confirm axpy argument order)
 }

 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) 
+{
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
@@ -570,7 +573,8 @@ void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionFiel
  }
 }
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
+{
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
@@ -579,27 +583,30 @@ void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionF
 }

 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) // out = mass * in, with out tagged on in's checkerboard
+{
   out.Checkerboard() = in.Checkerboard();
   typename FermionField::scalar_type scal(mass);
   out = scal * in;
 }

 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) // dagger variant: forwards unchanged to Mooee(in, out)
+{
   out.Checkerboard() = in.Checkerboard();
   Mooee(in, out);
 }

 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) // out = (1/mass) * in, with out tagged on in's checkerboard
+{
   out.Checkerboard() = in.Checkerboard();
   out = (1.0 / (mass)) * in;
 }

 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
-						   FermionField &out) {
+void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,FermionField &out) // dagger variant: forwards unchanged to MooeeInv(in, out)
+{
   out.Checkerboard() = in.Checkerboard();
   MooeeInv(in, out);
 }
@@ -611,6 +618,7 @@ template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 								PropagatorField &q_in_2,
 								PropagatorField &q_out,
+								PropagatorField &src,
 								Current curr_type,
 								unsigned int mu)
 {
@@ -620,11 +628,12 @@ void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 							   PropagatorField &q_out,
+							   PropagatorField &src,
 							   Current curr_type,
 							   unsigned int mu, 
 							   unsigned int tmin,
-                                              unsigned int tmax,
-					      ComplexField &lattice_cmplx)
+							   unsigned int tmax,
+							   ComplexField &lattice_cmplx)
 {
  assert(0);

@@ -171,21 +171,24 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
 /////////////////////////////

 template <class Impl>
-RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) // now void: result is written to out, no norm is returned
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerNo);
-  return axpy_norm(out, mass, in, out);
+  axpy(out, mass, in, out); // replaces axpy_norm; presumably out = mass*in + out (confirm axpy argument order)
 }

 template <class Impl>
-RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) // now void: result is written to out, no norm is returned
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerYes);
-  return axpy_norm(out, mass, in, out);
+  axpy(out, mass, in, out); // replaces axpy_norm; presumably out = mass*in + out (confirm axpy argument order)
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) 
+{
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
@@ -193,7 +196,8 @@ void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField
  }
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
+{
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
@@ -202,27 +206,30 @@ void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionFie
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) // out = mass * in, with out tagged on in's checkerboard
+{
   out.Checkerboard() = in.Checkerboard();
   typename FermionField::scalar_type scal(mass);
   out = scal * in;
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) // dagger variant: forwards unchanged to Mooee(in, out)
+{
   out.Checkerboard() = in.Checkerboard();
   Mooee(in, out);
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) // out = (1/mass) * in, with out tagged on in's checkerboard
+{
   out.Checkerboard() = in.Checkerboard();
   out = (1.0 / (mass)) * in;
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
-						 FermionField &out) {
+void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,FermionField &out) // dagger variant: forwards unchanged to MooeeInv(in, out)
+{
   out.Checkerboard() = in.Checkerboard();
   MooeeInv(in, out);
 }
@@ -234,7 +241,8 @@ void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, 
 						   GaugeField & mat,
-						   const FermionField &A, const FermionField &B, int dag) {
+						   const FermionField &A, const FermionField &B, int dag) 
+{
  assert((dag == DaggerNo) || (dag == DaggerYes));

  Compressor compressor;
@@ -284,8 +292,8 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
+void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
  conformable(U.Grid(), _grid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());
@@ -296,8 +304,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionFie
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
+void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());
@@ -310,8 +318,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionF
 }

 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
-
+void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());
@@ -600,6 +608,7 @@ template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 							      PropagatorField &q_in_2,
 							      PropagatorField &q_out,
+							      PropagatorField &src,
 							      Current curr_type,
 							      unsigned int mu)
 {
@@ -609,6 +618,7 @@ void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                         PropagatorField &q_out,
+                                                         PropagatorField &src,
                                                         Current curr_type,
                                                         unsigned int mu, 
                                                         unsigned int tmin,
@@ -166,7 +166,7 @@ void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& c
 /*****************************************************************************************************/

 template<class Impl>
-RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
+void MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
 {
  FermionField Din(psi.Grid());

@@ -174,11 +174,10 @@ RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
  this->DW(Din, chi, DaggerNo);
  axpby(chi, 1.0, 1.0, chi, psi);
  this->M5D(psi, chi);
-  return(norm2(chi));
 }

 template<class Impl>
-RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
+void MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
 {
  FermionField Din(psi.Grid());

@@ -186,7 +185,6 @@ RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
  this->MeooeDag5D(Din, chi);
  this->M5Ddag(psi, chi);
  axpby(chi, 1.0, 1.0, chi, psi);
-  return(norm2(chi));
 }

 /********************************************************************
@@ -269,16 +269,14 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
 }

 template<class Impl>
-RealD  PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out)
+void PartialFractionFermion5D<Impl>::M    (const FermionField &in, FermionField &out) // now void: delegates to M_internal with DaggerNo; norm2(out) is no longer returned
 {
   M_internal(in,out,DaggerNo);
-  return norm2(out);
 }
 template<class Impl>
-RealD  PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
+void PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out) // now void: delegates to M_internal with DaggerYes; norm2(out) is no longer returned
 {
   M_internal(in,out,DaggerYes);
-  return norm2(out);
 }

 template<class Impl>
@@ -35,7 +35,7 @@ NAMESPACE_BEGIN(Grid);

 // *NOT* EO
 template <class Impl>
-RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());

@@ -47,11 +47,10 @@ RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
  Mooee(in, temp);

  out += temp;
-  return norm2(out);
 }

 template <class Impl>
-RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
  FermionField temp(out.Grid());

@@ -63,7 +62,6 @@ RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
  MooeeDag(in, temp);

  out += temp;
-  return norm2(out);
 }

 template <class Impl>
@@ -132,14 +130,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);

-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+  pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
+  pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));

  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);

-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+  pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
 }

 template <class Impl>
@@ -861,7 +861,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
 * Conserved current utilities for Wilson fermions, for contracting propagators
 * to make a conserved current sink or inserting the conserved current 
 * sequentially.
- ******************************************************************************/

 // Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
 #define REVERSE_LS(qSite, qSiteRev, Nsimd) \
@@ -877,220 +876,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    merge(qSiteRev, qSiteVec); \
 }

-//          psi = chiralProjectPlus(Result_s[Ls/2-1]);
-//          psi+= chiralProjectMinus(Result_s[Ls/2]);
-//         PJ5q+=localInnerProduct(psi,psi);
-
-template<class vobj> 
-Lattice<vobj> spProj5p(const Lattice<vobj> & in)
-{
-  GridBase *grid=in.Grid();
-  Gamma G5(Gamma::Algebra::Gamma5);
-  Lattice<vobj> ret(grid);
-  auto ret_v = ret.View();
-  auto in_v  =  in.View();
-  thread_for(ss,grid->oSites(),{
-    ret_v[ss] = in_v[ss] + G5*in_v[ss];
-  });
-  return ret;
-}
-template<class vobj> 
-Lattice<vobj> spProj5m(const Lattice<vobj> & in)
-{
-  Gamma G5(Gamma::Algebra::Gamma5);
-  GridBase *grid=in.Grid();
-  Lattice<vobj> ret(grid);
-  auto ret_v = ret.View();
-  auto in_v  =  in.View();
-  thread_for(ss,grid->oSites(),{
-    ret_v[ss] = in_v[ss] - G5*in_v[ss];
-  });
-  return ret;
-}
-
-template <class Impl>
-void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
-{
-  conformable(GaugeGrid(), J5q.Grid());
-  conformable(q_in.Grid(), FermionGrid());
-
-  // 4d field
-  int Ls = this->Ls;
-  FermionField psi(GaugeGrid());
-  FermionField p_plus (GaugeGrid());
-  FermionField p_minus(GaugeGrid());
-  FermionField p(GaugeGrid());
-
-  ExtractSlice(p_plus , q_in, Ls/2   , 0);
-  ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
-  p_plus = spProj5p(p_plus );
-  p_minus= spProj5m(p_minus);
-  p=p_plus+p_minus;
-  J5q = localInnerProduct(p,p);
-}
-
-template <class Impl>
-void WilsonFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
-{
-  conformable(GaugeGrid(), J5q.Grid());
-  conformable(q_in.Grid(), FermionGrid());
-
-  // 4d field
-  int Ls = this->Ls;
-  PropagatorField psi(GaugeGrid());
-  PropagatorField p_plus (GaugeGrid());
-  PropagatorField p_minus(GaugeGrid());
-  PropagatorField p(GaugeGrid());
-
-  ExtractSlice(p_plus , q_in, Ls/2   , 0);
-  ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
-  p_plus = spProj5p(p_plus );
-  p_minus= spProj5m(p_minus);
-  p=p_plus+p_minus;
-  J5q = localInnerProduct(p,p);
-}
-
-template <class Impl>
-void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
-                                                     PropagatorField &q_in_2,
-                                                     PropagatorField &q_out,
-                                                     Current curr_type,
-                                                     unsigned int mu)
-{
-    conformable(q_in_1.Grid(), FermionGrid());
-    conformable(q_in_1.Grid(), q_in_2.Grid());
-    conformable(_FourDimGrid, q_out.Grid());
-
-    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
-    unsigned int LLs = q_in_1.Grid()->_rdimensions[0];
-    q_out = Zero();
-
-    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
-    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
-    tmp1 = Cshift(q_in_1, mu + 1, 1);
-    tmp2 = Cshift(q_in_2, mu + 1, 1);
-    auto q_in_1_v = q_in_1.View();
-    auto q_in_2_v = q_in_2.View();
-    auto tmp1_v   = tmp1.View();
-    auto tmp2_v   = tmp2.View();
-    auto q_out_v  = q_out.View();
-    auto Umu_v    = Umu.View();
-    thread_for(sU, Umu.Grid()->oSites(),{
-
-        unsigned int sF1 = sU * LLs;
-        unsigned int sF2 = (sU + 1) * LLs - 1;
-
-        for (unsigned int s = 0; s < LLs; ++s)
-        {
-            bool axial_sign = ((curr_type == Current::Axial) && \
-                               (s < (LLs / 2)));
-            SitePropagator qSite2, qmuSite2;
-
-            // If vectorised in 5th dimension, reverse q2 vector to match up
-            // sites correctly.
-            if (Impl::LsVectorised)
-            {
-                REVERSE_LS(q_in_2_v[sF2], qSite2, Ls / LLs);
-                REVERSE_LS(tmp2_v[sF2], qmuSite2, Ls / LLs);
-            }
-            else
-            {
-                qSite2   = q_in_2_v[sF2];
-                qmuSite2 = tmp2_v[sF2];
-            }
-            Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sF1], 
-                                                     qSite2, 
-                                                     q_out_v[sU],
-                                                     Umu_v, sU, mu, axial_sign);
-            Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sF1],
-                                                     qmuSite2,
-                                                     q_out_v[sU],
-                                                     Umu_v, sU, mu, axial_sign);
-            sF1++;
-            sF2--;
-        }
-    });
-}
+ ******************************************************************************/


-template <class Impl>
-void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
-                                                PropagatorField &q_out,
-                                                Current curr_type, 
-                                                unsigned int mu,
-                                                unsigned int tmin, 
-                                                unsigned int tmax,
-						ComplexField &lattice_cmplx)
-{
-    conformable(q_in.Grid(), FermionGrid());
-    conformable(q_in.Grid(), q_out.Grid());
-    PropagatorField tmp(GaugeGrid()),tmp2(GaugeGrid());
-    unsigned int tshift = (mu == Tp) ? 1 : 0;
-    unsigned int LLs = q_in.Grid()->_rdimensions[0];
-    unsigned int LLt    = GridDefaultLatt()[Tp];

-    q_out = Zero();
-    LatticeInteger coords(_FourDimGrid);
-    LatticeCoordinate(coords, Tp);
-    
-    auto q_out_v = q_out.View();
-    auto tmp2_v  = tmp2.View();
-    auto coords_v= coords.View();
-    auto Umu_v   = Umu.View();
-    for (unsigned int s = 0; s < LLs; ++s)
-    {
-        bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
-	bool tadpole_sign = (curr_type == Current::Tadpole);
-	bool switch_sgn = tadpole_sign || axial_sign;
-
-
-        //forward direction: Need q(x + mu, s)*A(x)
-        ExtractSlice(tmp2, q_in, s, 0);  //q(x,s) 
-        tmp = Cshift(tmp2, mu, 1);	 //q(x+mu,s)
-        tmp2 = tmp*lattice_cmplx;	 //q(x+mu,s)*A(x)	
-
-    	thread_for(sU, Umu.Grid()->oSites(),{
-            // Compute the sequential conserved current insertion only if our simd
-            // object contains a timeslice we need.
-            vPredicate t_mask;
-	    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
-            Integer timeSlices = Reduce(t_mask());
-
-            if (timeSlices > 0)
-            {
-		unsigned int sF = sU * LLs + s;
-                Kernels::SeqConservedCurrentSiteFwd(tmp2_v[sU], 
-						    q_out_v[sF], Umu_v, sU,
-						    mu, t_mask, switch_sgn);
-            }
-
-        });
-
-        //backward direction: Need q(x - mu, s)*A(x-mu)
-        ExtractSlice(tmp2, q_in, s, 0);  //q(x,s)
-        tmp = lattice_cmplx*tmp2;	 //q(x,s)*A(x)
-        tmp2 = Cshift(tmp, mu, -1);	 //q(x-mu,s)*A(x-mu,s)
-
-    	thread_for(sU, Umu.Grid()->oSites(),
-    	{
-	  vPredicate t_mask;
-	  t_mask()= ((coords_v[sU] >= (tmin + tshift)) && (coords_v[sU] <= (tmax + tshift)));
-
-	  //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-	  unsigned int t0 = 0;
-	  if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
-	  
-	  Integer timeSlices = Reduce(t_mask());
-	  
-	  if (timeSlices > 0) {
-	    unsigned int sF = sU * LLs + s; 
-	    Kernels::SeqConservedCurrentSiteBwd(tmp2_v[sU], 
-						q_out_v[sF], Umu_v, sU,
-						mu, t_mask, axial_sign);
-	  }
-	});
-    }
-}
  
 NAMESPACE_END(Grid);

@@ -102,21 +102,24 @@ void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 /////////////////////////////

 template <class Impl>
-RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) // now void: result is written to out, no norm is returned
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerNo);
-  return axpy_norm(out, diag_mass, in, out);
+  axpy(out, diag_mass, in, out); // replaces axpy_norm; presumably out = diag_mass*in + out (confirm axpy argument order)
 }

 template <class Impl>
-RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) // now void: result is written to out, no norm is returned
+{
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerYes);
-  return axpy_norm(out, diag_mass, in, out);
+  axpy(out, diag_mass, in, out); // replaces axpy_norm; presumably out = diag_mass*in + out (confirm axpy argument order)
 }

 template <class Impl>
-void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) 
+{
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
@@ -125,7 +128,8 @@ void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
 }

 template <class Impl>
-void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
+{
  if (in.Checkerboard() == Odd) {
    DhopEO(in, out, DaggerYes);
  } else {
@@ -134,26 +138,30 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
 }
  
 template <class Impl>
-void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) // out = diag_mass * in, with out tagged on in's checkerboard
+{
   out.Checkerboard() = in.Checkerboard();
   typename FermionField::scalar_type scal(diag_mass);
   out = scal * in;
 }

 template <class Impl>
-void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) // dagger variant: forwards unchanged to Mooee(in, out)
+{
   out.Checkerboard() = in.Checkerboard();
   Mooee(in, out);
 }

 template<class Impl>
-void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) // out = (1/diag_mass) * in, with out tagged on in's checkerboard
+{
   out.Checkerboard() = in.Checkerboard();
   out = (1.0/(diag_mass))*in;
 }
  
 template<class Impl>
-void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) // dagger variant: forwards unchanged to MooeeInv(in,out)
+{
   out.Checkerboard() = in.Checkerboard();
   MooeeInv(in,out);
 }
@@ -249,7 +257,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
  conformable(U.Grid(), _grid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());
@@ -260,7 +269,8 @@ void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, cons
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
@@ -274,7 +284,8 @@ void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, co
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
+void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) 
+{
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  //conformable(U.Grid(), mat.Grid());
@@ -287,7 +298,8 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 }

 template <class Impl>
-void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
+void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
+{
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@@ -297,7 +309,8 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
+void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
+{
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -308,7 +321,8 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
+void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) 
+{
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -386,7 +400,8 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
 						      DoubledGaugeField &U,
 						      const FermionField &in,
-						      FermionField &out, int dag) {
+						      FermionField &out, int dag) 
+{
  assert((dag == DaggerNo) || (dag == DaggerYes));

  Compressor compressor(dag);
@@ -436,7 +451,8 @@ template <class Impl>
 void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
-                                       FermionField &out, int dag) {
+                                       FermionField &out, int dag) 
+{
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
  st.HaloExchange(in, compressor);
@@ -459,6 +475,7 @@ template <class Impl>
 void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   PropagatorField &q_in_2,
                                                   PropagatorField &q_out,
+                                                   PropagatorField &src,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
@@ -466,6 +483,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
+#if 0
  PropagatorField tmp1(_grid), tmp2(_grid);
  q_out = Zero();

@@ -489,12 +507,15 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 					       q_out_v[sU],
 					       Umu_v, sU, mu);
  });
+#else
+#endif
 }


 template <class Impl>
 void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
                                              PropagatorField &q_out,
+                                              PropagatorField &src,
                                              Current curr_type,
                                              unsigned int mu,
                                              unsigned int tmin, 
@@ -503,6 +524,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 {
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
+#if 0

  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
  Complex i(0.0,1.0);
@@ -556,6 +578,8 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 					  Umu_v, sU, mu, t_mask);
    }
  });
+#else
+#endif
 }

 NAMESPACE_END(Grid);
@@ -444,19 +444,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite); printf(".");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt); printf("-");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) { 
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt); printf("+");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
   }
   assert(0 && " Kernel optimisation case not covered ");
@@ -493,131 +493,5 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   assert(0 && " Kernel optimisation case not covered ");
  }

-/*******************************************************************************
- * Conserved current utilities for Wilson fermions, for contracting propagators
- * to make a conserved current sink or inserting the conserved current 
- * sequentially. Common to both 4D and 5D.
- ******************************************************************************/
-// N.B. Functions below assume a -1/2 factor within U.
-#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
-#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteFwd
- * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_1 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
-							  const SitePropagator &q_in_2,
-							  SitePropagator &q_out,
-							  DoubledGaugeFieldView &U,
-							  unsigned int sU,
-							  unsigned int mu,
-							  bool switch_sign)
-{
-  SitePropagator result, tmp;
-  Gamma g5(Gamma::Algebra::Gamma5);
-
-  Impl::multLink(tmp, U[sU], q_in_1, mu);
-
-  result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
-
-  if (switch_sign) {
-    q_out -= result;
-  } else {
-    q_out += result;
-  }
-}
-
-/*******************************************************************************
- * Name: ContractConservedCurrentSiteBwd
- * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in_2 shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
-							  const SitePropagator &q_in_2,
-							  SitePropagator &q_out,
-							  DoubledGaugeFieldView &U,
-							  unsigned int sU,
-							  unsigned int mu,
-							  bool switch_sign)
-{
-  SitePropagator result, tmp;
-  Gamma g5(Gamma::Algebra::Gamma5);
-
-  Impl::multLink(tmp, U[sU], q_in_1, mu + Nd);
-
-  result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
-  if (switch_sign) {
-    q_out += result;
-  } else {
-    q_out -= result;
-  }
-}
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteFwd
- * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in +ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeFieldView &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vPredicate t_mask,
-                                                     bool switch_sign)
-{
-  SitePropagator result;
-  
-  Impl::multLink(result, U[sU], q_in, mu);
-  result = WilsonCurrentFwd(result, mu);
-
-  // Zero any unwanted timeslice entries.
-  result = predicatedWhere(t_mask, result, 0.*result);
-  
-  if (switch_sign) {
-    q_out -= result;
-  } else {
-    q_out += result;
-  }
-}
-
-/*******************************************************************************
- * Name: SeqConservedCurrentSiteBwd
- * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
- * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
- *        - Pass in q_in shifted in -ve mu direction.
- ******************************************************************************/
-template<class Impl>
-void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
-                                                     SitePropagator &q_out,
-                                                     DoubledGaugeFieldView &U,
-                                                     unsigned int sU,
-                                                     unsigned int mu,
-                                                     vPredicate t_mask,
-                                                     bool switch_sign)
-{
-  SitePropagator result;
-  Impl::multLink(result, U[sU], q_in, mu + Nd);
-  result = WilsonCurrentBwd(result, mu);
-
-  // Zero any unwanted timeslice entries.
-  result = predicatedWhere(t_mask, result, 0.*result);
-  
-  if (switch_sign) {
-    q_out += result;
-  } else {
-    q_out -= result;
-  }
-}
-
-
 NAMESPACE_END(Grid);

@@ -1,44 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
-
-			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
-			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class CayleyFermion5D<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
-
@@ -0,0 +1 @@
+../CayleyFermion5DInstantiation.cc.master
@@ -1,38 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
-#include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class ContinuedFractionFermion5D<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
@@ -0,0 +1 @@
+../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1,44 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class DomainWallEOFAFermion<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
@@ -0,0 +1 @@
+../DomainWallEOFAFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-#include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class MobiusEOFAFermion<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
@@ -0,0 +1 @@
+../MobiusEOFAFermionInstantiation.cc.master
@@ -1,39 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
-#include <Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
- 
-#include "impl.h"
-template class PartialFractionFermion5D<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
-
@@ -0,0 +1 @@
+../PartialFractionFermion5DInstantiation.cc.master
@@ -1,40 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
-
-    Copyright (C) 2017
-
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Grid.h>
-#include <Grid/qcd/spin/Dirac.h>
-#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class WilsonCloverFermion<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
@@ -0,0 +1 @@
+../WilsonCloverFermionInstantiation.cc.master
@@ -1,40 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class WilsonFermion5D<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
-
@@ -0,0 +1 @@
+../WilsonFermion5DInstantiation.cc.master
@@ -1,40 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class WilsonFermion<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
-
@@ -0,0 +1 @@
+../WilsonFermionInstantiation.cc.master
@@ -1,74 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Move these
-
-#include "impl.h"
-
-// G-parity requires more specialised implementation.
-template <>
-void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1, 
-							  const SitePropagator &q_in_2, 
-							  SitePropagator &q_out, 
-							  DoubledGaugeFieldView &U,
-							  unsigned int sU, 
-							  unsigned int mu, 
-							  bool switch_sign)
-{			
-  assert(0);		
-}			
-template <>		
-void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteBwd( const SitePropagator &q_in_1,
-							   const SitePropagator &q_in_2,
-							   SitePropagator &q_out,
-							   DoubledGaugeFieldView &U,
-							   unsigned int mu,
-							   unsigned int sU,
-							   bool switch_sign)
-{			
-  assert(0);		
-}
-
-HAND_SPECIALISE_GPARITY(IMPLEMENTATION);
-
-
-template class WilsonKernels<IMPLEMENTATION>; 
-
-
-NAMESPACE_END(Grid);
-
@@ -0,0 +1 @@
+../WilsonKernelsInstantiationGparity.cc.master
@@ -1,37 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class WilsonTMFermion<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
@@ -0,0 +1 @@
+../WilsonTMFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
-
-			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
-			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class CayleyFermion5D<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
-
@@ -0,0 +1 @@
+../CayleyFermion5DInstantiation.cc.master
@@ -1,38 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
-#include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class ContinuedFractionFermion5D<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
@@ -0,0 +1 @@
+../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1,44 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
-#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class DomainWallEOFAFermion<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
@@ -0,0 +1 @@
+../DomainWallEOFAFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
-
-Copyright (C) 2017
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: David Murphy <dmurphy@phys.columbia.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-			   /*  END LEGAL */
-
-#include <Grid/Grid_Eigen_Dense.h>
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
-#include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class MobiusEOFAFermion<IMPLEMENTATION>; 
-
-NAMESPACE_END(Grid);
@@ -0,0 +1 @@
+../MobiusEOFAFermionInstantiation.cc.master
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
portelli	86a9cc8c27	relative Eigen links, allows moving safely Grid's directory	2020-06-04 10:56:34 +01:00
Peter Boyle	c5c2dbc0ce	Optional CUDA info	2020-06-02 14:21:49 -04:00
portelli	5aa60be17d	SerialisableClassName method for serialisable enum, and boolean to test if a serialisable object is an enum	2020-05-15 20:00:34 +01:00
Peter Boyle	2e652431e5	No compile on summiit fix	2020-05-12 18:56:47 -04:00
Peter Boyle	8b5b55b682	Make tests all compile ccurrent Grid, mostly MdagM removal of norms fixes but a few minor issues fiixed too	2020-05-12 17:57:24 -04:00
Peter Boyle	0e3c49f687	TransposeIndex was broken by Christoph	2020-05-12 17:57:01 -04:00
Peter Boyle	cb7ee37562	Close expressions in arg to cshift	2020-05-12 17:56:40 -04:00
Peter Boyle	82f71643a4	Remove the norm in MdagM	2020-05-12 17:55:53 -04:00
Peter Boyle	ea08f193e7	Allocator cache spliit into large/small pools	2020-05-10 05:24:26 -04:00
Peter Boyle	2bb2c68e15	Separate pools for small and large allocations cache	2020-05-09 22:57:21 -04:00
Peter Boyle	efe5bc6a3c	Split allocator cache into two pools of different sizes	2020-05-09 22:27:56 -04:00
Peter Boyle	384da487bd	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-05-08 18:55:11 -04:00
Peter Boyle	ee1de82a53	Working ITT benchmark again	2020-05-08 18:54:50 -04:00
Peter Boyle	2b576fc185	Comment deadd codde remove	2020-05-08 18:54:29 -04:00
Peter Boyle	b01b7f761a	Merge pull request #283 from DanielRichtmann/feature/minor-fixes Some small fixes	2020-05-08 10:52:03 -04:00
Daniel Richtmann	c83471bfd0	Fix missing checkerboards for adj und conjugate	2020-05-08 16:44:03 +02:00
Daniel Richtmann	ab0c5d77fb	Correct NonHermitianSchurOperatorBase	2020-05-08 16:44:02 +02:00
Daniel Richtmann	779e3c7442	Const-correctness for retrieval routines of GridStopWatch	2020-05-08 16:43:52 +02:00
Daniel Richtmann	0c570824f2	Add missing declaration of GridCmdOptionInt	2020-05-08 16:43:51 +02:00
Peter Boyle	0dd1bdfa94	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-05-08 09:21:43 -04:00
Peter Boyle	1d65e2f62c	Slightly faster Chebyshev; ifdef'ed out the fastest until tested numerics Lifteed from HDCR setup	2020-05-08 09:20:54 -04:00
Peter Boyle	93920c4811	Remove verbose	2020-05-08 09:19:54 -04:00
Peter Boyle	6859a3e1d4	Schur operator	2020-05-08 09:19:12 -04:00
Peter Boyle	21ca182c36	Comments remove	2020-05-08 09:18:24 -04:00
portelli	053b4dd495	Merge pull request #282 from felixerben/baryon-reversal Baryon reversal	2020-05-07 18:09:17 +01:00
ferben	42bb5f0721	asserrtion	2020-05-07 18:06:12 +01:00
ferben	253bcc3426	back to old version	2020-05-07 18:03:17 +01:00
portelli	a887206413	Merge pull request #281 from felixerben/feature/baryonSpeedup Feature/baryon speedup	2020-05-07 13:41:29 +01:00
ferben	591ebb6213	Merge branch 'develop' of github.com:paboyle/Grid into feature/baryonSpeedup	2020-05-07 11:13:21 +01:00
ferben	56e2f7d088	deleted test routines. cleaned up fast version. assert Ns=4,Nc=3.	2020-05-07 10:03:45 +01:00
Peter Boyle	525418abfb	Merge pull request #273 from lehner/feature/gpt Feature/gpt	2020-05-06 10:10:51 -04:00
Peter Boyle	5f780806c2	Merge pull request #279 from paboyle/bugfix/nvcc-config configure fix for nvcc with extra arguments as CXX	2020-05-06 10:07:52 -04:00
Christoph Lehner	3c6ffcb48c	Merge branch 'develop' into feature/gpt	2020-05-06 15:03:35 +02:00
Christoph Lehner	87984ece7d	add Lattice_basis.h	2020-05-06 08:47:18 -04:00
Christoph Lehner	e9b295f967	Synchronize blocking infrastructure with GPT	2020-05-06 08:42:28 -04:00
Peter Boyle	224cbf0453	Merge pull request #280 from mmphys/bugfix/ET_go_home Bugfix/et go home	2020-05-05 17:56:51 -04:00
Michael Marshall	c1e57d4357	Merge branch 'develop' into bugfix/ET_go_home * develop: SYCL prep - no sycl just make it compile through DPC++ dpc++ didn't like rdtsc() Make compile if HAVE_LIME=0 Lime optional	2020-05-05 22:35:04 +01:00
Christoph Lehner	6b64727161	disable comments	2020-05-05 05:05:36 -04:00
Christoph Lehner	04863f8f38	debug new AcceleratorView	2020-05-04 16:07:03 -04:00
u37294	04927d2e40	SYCL prep - no sycl just make it compile through DPC++	2020-05-04 10:28:29 -07:00
u37294	7caed4edd9	dpc++ didn't like rdtsc()	2020-05-04 10:27:05 -07:00
u37294	59c51d2c35	Make compile if HAVE_LIME=0	2020-05-04 10:26:20 -07:00
u37294	ff53b231c8	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-05-04 10:25:10 -07:00
u37294	fc19cf905b	Lime optional	2020-05-04 10:24:48 -07:00
Christoph Lehner	2a1387e992	rankInnerProduct	2020-05-03 17:27:11 -04:00
Christoph Lehner	9bfa51bffb	cleanup comment	2020-05-03 09:12:52 -04:00
Christoph Lehner	38532753f4	interface cleanup	2020-05-03 08:58:32 -04:00
Christoph Lehner	949be9605c	fix pragmas	2020-05-02 16:20:03 -04:00
Christoph Lehner	63cf201ee7	Add AdviseInfrequentUse	2020-05-02 11:38:42 -04:00
Christoph Lehner	c8af498a2a	BinaryIO fix for alternative little-endian format name (used in 96I ensemble)	2020-05-01 03:45:50 -04:00
Christoph Lehner	ddb192bac7	re-work double precision promotion for summit	2020-04-30 16:09:57 -04:00
Michael Marshall	7666300a6f	Merge branch 'develop' into bugfix/ET_go_home * develop: Basis rotate stack passig to GPU reduction Clean up warning	2020-04-30 20:10:32 +01:00
Michael Marshall	4a4b9e305d	Fix: strToVec enters infinite loop and exhausts memory if operator>> fails before the end of string, e.g. if parsing "0_0_0" for momentum instead of "0 0 0".	2020-04-30 19:40:04 +01:00
Peter Boyle	9b2d2d0fc3	Basis rotate stack passig to GPU reduction	2020-04-30 12:31:07 -04:00
Peter Boyle	5011753f4f	Clean up warning	2020-04-30 10:23:48 -04:00
Michael Marshall	dbaeefaeef	All Eigen::TensorMap objects are fixed (i.e. cannot be dynamically resized)	2020-04-30 15:02:51 +01:00
Christopher Kelly	dee96cbf82	Added workaround in configure to still catch Cuda compiler when nvcc with extra arguments (eg -ccbin) is used as CXX	2020-04-29 10:37:11 -04:00
Peter Boyle	dd3ebc2ce4	Slow compile on NVCC switch off conserved current	2020-04-29 08:43:12 -04:00
Peter Boyle	103e7ae2f0	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-04-29 03:05:36 -04:00
Peter Boyle	29ae5615c0	Seqeuential fix	2020-04-29 03:05:15 -04:00
ferben	6240e02619	added assertion to avoid potential infinite loop	2020-04-27 18:50:53 +01:00
ferben	f4033ad8cb	baryon speedup by a factor 2	2020-04-27 17:46:14 +01:00
Christoph Lehner	f1fe444d4f	blocked precision promotion infrastructure upgrade	2020-04-24 06:27:20 -04:00
Peter Boyle	dae820aa96	Merge pull request #277 from mmphys/bugfix/grid-config Bugfix/grid config	2020-04-23 10:26:54 -04:00
Michael Marshall	5daf176f4a	Updated to expose GRID_CXXLD in addition to CXXLD. NB: CXXLD required as this is what drives linking behaviour.	2020-04-23 15:25:53 +01:00
Michael Marshall	e96c86ec14	Make grid-config message more specific for --cxx and --cxxld	2020-04-23 13:10:45 +01:00
Peter Boyle	c2c3cad20d	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2020-04-23 04:35:42 -04:00
Peter Boyle	edec9ee2e2	Conserved current rewrite done. Zmobius working	2020-04-23 04:34:01 -04:00
Peter Boyle	ed70cce542	Test for 5D DWF obserevables	2020-04-23 04:29:45 -04:00
Michael Marshall	4701201b5f	grid-config: Expose CXXLD (for GPU build) and update help	2020-04-22 18:42:30 +01:00
Peter Boyle	0782b76ed4	Merge pull request #274 from paboyle/feature/zmobius_paramcompute ZMobius parameter computation	2020-04-20 14:39:29 -04:00
Christopher Kelly	0896f2cead	Added missing include guards in bigfloat_double.h	2020-04-20 10:30:38 -04:00
Christopher Kelly	181709bba4	Merge branch 'develop' into feature/zmobius_paramcompute	2020-04-20 09:12:34 -04:00
Christoph Lehner	091d5c605e	towards more precise blocking	2020-04-17 04:25:28 -04:00
Peter Boyle	90229cfb0f	Merge pull request #270 from milc-qcd/feature/CGinfo feature/CGinfo	2020-04-16 11:46:08 -04:00
Peter Boyle	0475c46ecb	Merge pull request #256 from djm2131/feature/BiCGSTAB Import BiCGSTAB solvers and tests	2020-04-16 11:45:15 -04:00
Peter Boyle	3cca10e617	Merge pull request #276 from nils-asmussen/fix/regression_nt fix regression in tests/core/Test_qed.cc	2020-04-16 11:42:39 -04:00
Christoph Lehner	327da332bb	Merge branch 'develop' of https://github.com/paboyle/Grid into feature/gpt	2020-04-16 11:30:17 -04:00
asmussen	43dc2814dd	fix regression in core/Test_qed.cc	2020-04-15 16:10:15 +01:00
Peter Boyle	f3a8d039a2	Merge branch 'feature/hdcr' into develop	2020-04-10 22:01:52 -04:00
portelli	4e864e56c9	develop pull	2020-04-10 17:19:18 +01:00
Peter Boyle	014dbfa464	Compile fix with OpDirAll	2020-04-10 11:57:09 -04:00
Peter Boyle	3b0e07882f	Adding another form of polynomial	2020-04-10 11:28:33 -04:00
Peter Boyle	8e81a811d0	Merge branch 'feature/hdcr' into develop	2020-04-10 11:14:49 -04:00
Christoph Lehner	96e8e44fd4	Merge pull request #2 from DanielRichtmann/feature/fused-innerproduct-norm2 Fused innerProduct + norm2 on first argument operation	2020-04-06 13:16:58 +02:00
Daniel Richtmann	5fc8a273e7	Fused innerProduct + norm2 on first argument operation	2020-04-06 11:52:29 +02:00
portelli	d671a63e78	Update README.md	2020-04-03 19:52:15 +01:00
Christoph Lehner	856d168e41	global sum over vectors of uint64_t	2020-03-29 07:56:05 -04:00
portelli	6235c7ba98	IPP path fix in configure	2020-03-27 17:23:29 +00:00
portelli	7e13724882	removing Hadrons	2020-03-27 12:03:32 +00:00
Christoph Lehner	b6cbdd2aa3	Merge pull request #1 from DanielRichtmann/feature/read-openqcd Feature/read openqcd	2020-03-26 17:39:04 +01:00
Christoph Lehner	a2188ea875	remove debugging printf from WilsonKernelsImplementation	2020-03-26 09:12:36 -04:00
Daniel Richtmann	989af65807	Check in parallel reader for openqcd configs	2020-03-24 11:20:54 +01:00
Christoph Lehner	60db3133d3	make trace,adj,transpose unary operators	2020-03-16 17:59:56 -04:00
Christoph Lehner	c9b737a4e7	make trace,adj,transpose unary operators	2020-03-16 17:58:30 -04:00
Daniel Richtmann	037bb6ea73	Check in reader for openqcd configs This reader is suboptimal in the sense that it opens the entire config on every MPI rank.	2020-03-16 14:28:02 +01:00
portelli	05ebc458e2	Merge pull request #260 from mmphys/feature/distil Distillation: save eigenvalues of the Laplacian for all timeslices	2020-03-13 14:00:21 +00:00
Michael Marshall	3753508957	Making change 1) as simple as possible 2) as much like MSink/Point.hpp as possible	2020-03-12 13:47:51 +00:00
Michael Marshall	c1677fccf6	Merge branch 'develop' into feature/distil * develop: bugfix ZPerambulator registered module supporting ZMobius action changed to push_back according to request Added Hadrons_Error in case blockSize is set too large bugfix in perambulator module # Conflicts: # Hadrons/Modules/MDistil/Perambulator.hpp	2020-03-12 12:45:18 +00:00
portelli	35e8e31749	Merge pull request #272 from mmphys/feature/ZPeramb bugfix ZPerambulator	2020-03-12 12:28:04 +00:00
portelli	34813e9b04	Merge branch 'develop' into feature/ZPeramb	2020-03-12 12:27:56 +00:00
Felix Erben	373cf61abb	bugfix ZPerambulator	2020-03-12 11:44:43 +00:00
portelli	4e8fbc4b49	Merge pull request #271 from mmphys/feature/ZDistil registered module supporting ZMobius action	2020-03-12 10:54:07 +00:00
ferben	516ac1d4d5	registered module supporting ZMobius action	2020-03-12 10:52:27 +00:00
portelli	318f63eb34	Merge pull request #268 from mmphys/a2a-error-log Added Hadrons_Error in case blockSize is set too large	2020-03-11 11:09:00 +00:00
portelli	16503d7532	Merge pull request #267 from mmphys/feature/distil-bugfix bugfix in perambulator module	2020-03-11 11:08:23 +00:00
ferben	0fa93383b7	changed to push_back according to request	2020-03-11 09:05:01 +00:00
ferben	0a827aa7bf	Added Hadrons_Error in case blockSize is set too large	2020-03-11 08:52:52 +00:00
Carleton DeTar	165c68e28e	Change TrueResiduals to TrueResidualShift and IterationsToComplete to IterationsToCompleteShift	2020-02-29 17:51:51 -06:00
ferben	b32b1ca642	bugfix in perambulator module	2020-02-26 12:06:45 +00:00
Carleton DeTar	9479bc8486	Make IterationsToComplete and TrueResidual externally accessible	2020-02-19 17:43:57 -06:00
Michael Marshall	10192dfc71	Wall source momenta must be specified for spatial components only. So we don't break existing scripts, allow momentum in time direction as well, but only if zero. Fail early, so do the check in setup()	2020-01-31 15:02:03 +00:00
Michael Marshall	c69a3b6ef6	When saving eigenvectors, LapEvec now saves eigenvalues for every timeslice as well. I.e. nT x nVec eigenvalues are saved in FileName.evals.conf.h5. A new named tensor, "TimesliceEvals" can be used to simplify restoring these from disk. NB: The changes in BaseIO add support so that Eigen tensors can be easily used in MPI operations, e.g. GlobalSum. See LapEvec.hpp for an example of how this is done.	2020-01-29 21:20:20 +00:00
Michael Marshall	2ed39ebb7a	Perambulator won't even allocate memory for unsmeared sinks unless the filename is specified. Prior to this update, memory is allocated regardless of whether these are requested.	2020-01-24 13:01:06 +00:00
Christopher Kelly	96671bbb24	Added ability to pass callback to MADWF that is called every inner iteration and allows user to, for example, adjust the inner solver tolerance depending on residual Added a general implementation of the Remez algorithm for producing arbitrary rational polynomial approximation with optional restriction to even/odd polynomials Added implementation of computation of ZMobius parameters Added Test_zMADWF_prec to test ZMobius in MADWF	2020-01-17 12:45:30 -08:00
Michael Marshall	0ca1992151	Remove warning in tensor layout comparison. Make default names and index names visible for PerambTensor and NoiseTensor	2019-12-20 13:53:27 +00:00
Michael Marshall	df2b0c4e79	Merge branch 'develop' into feature/distil * develop: Missing conjugate in MooeeInvDag Allow subspace setup to no converge fp16 mandatory. Use SFW is not available as hdw	2019-12-20 13:24:59 +00:00
Peter Boyle	5d834486c9	Merge pull request #259 from grid-test-organisation/feature/5d-improvement-fix Missing conjugate in MooeeInvDag	2019-12-16 04:20:37 -05:00
gfilaci	f7373e97a4	Missing conjugate in MooeeInvDag	2019-12-16 10:05:50 +01:00
Peter Boyle	b8bd8cd2ae	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2019-12-13 21:32:10 -05:00
Michael Marshall	c7637a84ad	Documentation tweak for peculiarities of OpenMPI --prefix	2019-12-12 17:00:03 +00:00
Michael Marshall	a7772c827b	Documentation tweak	2019-12-12 16:05:22 +00:00
portelli	8e83398861	Merge pull request #257 from AndrewYongZhenNing/develop Added NamedTensor.hpp	2019-12-11 21:36:59 +00:00
David Murphy	843ca9350a	Fix naming conventions to be consistent with Peter	2019-12-11 11:46:18 -05:00
aznyong	f47b2b6e13	Added NamedTensor.hpp	2019-12-11 15:56:46 +00:00
David Murphy	4180a4a8a7	Import BiCGSTAB solvers and tests	2019-12-10 17:20:35 -05:00
				`@@ -0,0 +1 @@`
				`../ContinuedFractionFermion5DInstantiation.cc.master`
				`@@ -0,0 +1 @@`
				`../DomainWallEOFAFermionInstantiation.cc.master`
				`@@ -0,0 +1 @@`
				`../MobiusEOFAFermionInstantiation.cc.master`
				`@@ -0,0 +1 @@`
				`../PartialFractionFermion5DInstantiation.cc.master`
				`@@ -0,0 +1 @@`
				`../WilsonCloverFermionInstantiation.cc.master`
				`@@ -0,0 +1 @@`
				`../WilsonKernelsInstantiationGparity.cc.master`