From a263e78f8d5eb655f266df99da5c7c073f37dabe Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 5 Jun 2015 18:16:25 +0100
Subject: [PATCH 01/24] Conjugate residual added

---
 lib/Algorithms.h                             |  1 +
 lib/Make.inc                                 |  4 +-
 lib/algorithms/LinearOperator.h              | 18 ++++-
 lib/algorithms/SparseMatrix.h                |  1 +
 lib/algorithms/iterative/ConjugateResidual.h | 85 ++++++++++++++++++++
 lib/qcd/QCD.h                                |  8 +-
 lib/qcd/action/Actions.h                     | 35 +-------
 lib/qcd/action/fermion/g5HermitianLinop.h    | 38 +++++++++
 lib/qcd/{ => spin}/Dirac.cc                  |  0
 lib/qcd/{ => spin}/Dirac.h                   |  0
 lib/qcd/{ => spin}/TwoSpinor.h               |  0
 lib/qcd/{ => utils}/LinalgUtils.h            | 19 +++++
 lib/qcd/{ => utils}/SpaceTimeGrid.cc         |  0
 lib/qcd/{ => utils}/SpaceTimeGrid.h          |  0
 tests/Make.inc                               | 12 +++
 15 files changed, 181 insertions(+), 40 deletions(-)
 create mode 100644 lib/algorithms/iterative/ConjugateResidual.h
 create mode 100644 lib/qcd/action/fermion/g5HermitianLinop.h
 rename lib/qcd/{ => spin}/Dirac.cc (100%)
 rename lib/qcd/{ => spin}/Dirac.h (100%)
 rename lib/qcd/{ => spin}/TwoSpinor.h (100%)
 rename lib/qcd/{ => utils}/LinalgUtils.h (88%)
 rename lib/qcd/{ => utils}/SpaceTimeGrid.cc (100%)
 rename lib/qcd/{ => utils}/SpaceTimeGrid.h (100%)

diff --git a/lib/Algorithms.h b/lib/Algorithms.h
index b5a2814a..c2491cf4 100644
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -5,6 +5,7 @@
 #include <algorithms/LinearOperator.h>
 
 #include <algorithms/iterative/ConjugateGradient.h>
+#include <algorithms/iterative/ConjugateResidual.h>
 #include <algorithms/iterative/NormalEquations.h>
 #include <algorithms/iterative/SchurRedBlack.h>
 
diff --git a/lib/Make.inc b/lib/Make.inc
index 94a566fb..ba35e25f 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/Dirac.h ./qcd/LinalgUtils.h ./qcd/QCD.h ./qcd/SpaceTimeGrid.h ./qcd/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./Tensors.h ./Threads.h
+HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./Tensors.h ./Threads.h
 
-CCFILES=./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/Dirac.cc ./qcd/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index 710fcde7..492b34db 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -18,7 +18,8 @@ namespace Grid {
     public:
       virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
       virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
-      virtual void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2)=0;
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
+      virtual void HermOp(const Field &in, Field &out)=0;
     };
 
 
@@ -48,9 +49,13 @@ namespace Grid {
       void AdjOp     (const Field &in, Field &out){
 	_Mat.Mdag(in,out);
       }
-      void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
 	_Mat.MdagM(in,out,n1,n2);
       }
+      void HermOp(const Field &in, Field &out){
+	RealD n1,n2;
+	HermOpAndNorm(in,out,n1,n2);
+      }
     };
 
     ////////////////////////////////////////////////////////////////////
@@ -67,7 +72,7 @@ namespace Grid {
       void AdjOp     (const Field &in, Field &out){
 	_Mat.M(in,out);
       }
-      void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+      void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
 	ComplexD dot;
 
 	_Mat.M(in,out);
@@ -78,6 +83,9 @@ namespace Grid {
 	dot = innerProduct(out,out);
 	n2=real(dot);
       }
+      void HermOp(const Field &in, Field &out){
+	_Mat.M(in,out);
+      }
     };
 
     //////////////////////////////////////////////////////////
@@ -98,6 +106,10 @@ namespace Grid {
       void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
 	MpcDagMpc(in,out,n1,n2);
       }
+      void HermOp(const Field &in, Field &out){
+	RealD n1,n2;
+	HermOpAndNorm(in,out,n1,n2);
+      }
       void Op     (const Field &in, Field &out){
 	Mpc(in,out);
       }
diff --git a/lib/algorithms/SparseMatrix.h b/lib/algorithms/SparseMatrix.h
index 9146648f..a51657dc 100644
--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@@ -18,6 +18,7 @@ namespace Grid {
 	Field tmp (in._grid);
 	ni=M(in,tmp);
 	no=Mdag(tmp,out);
+	std::cout << "MdagM "<< ni<<" "<<no<<std::endl;
       }
     };
 
diff --git a/lib/algorithms/iterative/ConjugateResidual.h b/lib/algorithms/iterative/ConjugateResidual.h
new file mode 100644
index 00000000..ae1f4378
--- /dev/null
+++ b/lib/algorithms/iterative/ConjugateResidual.h
@@ -0,0 +1,85 @@
+#ifndef GRID_CONJUGATE_RESIDUAL_H
+#define GRID_CONJUGATE_RESIDUAL_H
+
+namespace Grid {
+
+    /////////////////////////////////////////////////////////////
+    // Base classes for iterative processes based on operators
+    // single input vec, single output vec.
+    /////////////////////////////////////////////////////////////
+
+  template<class Field> 
+    class ConjugateResidual : public OperatorFunction<Field> {
+  public:                                                
+    RealD   Tolerance;
+    Integer MaxIterations;
+    int verbose;
+
+    ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
+      verbose=1;
+    };
+
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
+
+      RealD a, b, c, d;
+      RealD cp, ssq,rsq;
+      
+      RealD rAr, rAAr, rArp;
+      RealD pAp, pAAp;
+
+      GridBase *grid = src._grid;
+      psi=zero;
+      Field r(grid),  p(grid), Ap(grid), Ar(grid);
+      
+      r=src;
+      p=src;
+
+      Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
+      Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
+
+      std::cout << "pAp, pAAp"<< pAp<<" "<<pAAp<<std::endl;
+      std::cout << "rAr, rAAr"<< rAr<<" "<<rAAr<<std::endl;
+
+      cp =norm2(r);
+      ssq=norm2(src);
+      rsq=Tolerance*Tolerance*ssq;
+
+      std::cout<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+      for(int k=1;k<MaxIterations;k++){
+
+	a = rAr/pAAp;
+
+	axpy(psi,a,p,psi);
+
+	cp = axpy_norm(r,-a,Ap,r);
+
+	rArp=rAr;
+
+	Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
+
+	b   =rAr/rArp;
+ 
+	axpy(p,b,p,r);
+	pAAp=axpy_norm(Ap,b,Ap,Ar);
+
+	std::cout<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
+	if(cp<rsq) {
+	  Linop.HermOp(psi,Ap);
+	  axpy(r,-1.0,src,Ap);
+	  RealD true_resid = norm2(r);
+	  std::cout<<"ConjugateResidual: Converged on iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	  std::cout<<"ConjugateResidual: true   residual  is "<<true_resid<<std::endl;
+	  std::cout<<"ConjugateResidual: target residual was "<<Tolerance <<std::endl;
+	  return;
+	}
+
+      }
+
+      std::cout<<"ConjugateResidual did NOT converge"<<std::endl;
+      assert(0);
+    }
+  };
+}
+#endif
diff --git a/lib/qcd/QCD.h b/lib/qcd/QCD.h
index c3f70c43..f17c557b 100644
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -307,10 +307,10 @@ namespace QCD {
 }   //namespace QCD
 } // Grid
 
-#include <qcd/SpaceTimeGrid.h>
-#include <qcd/Dirac.h>
-#include <qcd/TwoSpinor.h>
-#include <qcd/LinalgUtils.h>
+#include <qcd/utils/SpaceTimeGrid.h>
+#include <qcd/spin/Dirac.h>
+#include <qcd/spin/TwoSpinor.h>
+#include <qcd/utils/LinalgUtils.h>
 #include <qcd/utils/CovariantCshift.h>
 #include <qcd/utils/WilsonLoops.h>
 #include <qcd/action/Actions.h>
diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h
index edef1b53..56b453c6 100644
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -65,36 +65,9 @@
 #include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
 
-
-    // Chroma interface defining FermionAction
-    /*
-     template<typename T, typename P, typename Q>  class FermAct4D : public FermionAction<T,P,Q>
-     virtual LinearOperator<T>* linOp(Handle< FermState<T,P,Q> > state) const = 0;
-     virtual LinearOperator<T>* lMdagM(Handle< FermState<T,P,Q> > state) const = 0;
-     virtual LinOpSystemSolver<T>* invLinOp(Handle< FermState<T,P,Q> > state,
-     virtual MdagMSystemSolver<T>* invMdagM(Handle< FermState<T,P,Q> > state,
-     virtual LinOpMultiSystemSolver<T>* mInvLinOp(Handle< FermState<T,P,Q> > state,
-     virtual MdagMMultiSystemSolver<T>* mInvMdagM(Handle< FermState<T,P,Q> > state,
-     virtual MdagMMultiSystemSolverAccumulate<T>* mInvMdagMAcc(Handle< FermState<T,P,Q> > state,
-     virtual SystemSolver<T>* qprop(Handle< FermState<T,P,Q> > state,
-     class DiffFermAct4D : public FermAct4D<T,P,Q>
-     virtual DiffLinearOperator<T,Q,P>* linOp(Handle< FermState<T,P,Q> > state) const = 0;
-     virtual DiffLinearOperator<T,Q,P>* lMdagM(Handle< FermState<T,P,Q> > state) const = 0;
-    */
-
-
-    // Chroma interface defining GaugeAction
-    /*
-      template<typename P, typename Q>   class GaugeAction
-  virtual const CreateGaugeState<P,Q>& getCreateState() const = 0;
-  virtual GaugeState<P,Q>* createState(const Q& q) const
-  virtual const GaugeBC<P,Q>& getGaugeBC() const
-  virtual const Set& getSet(void) const = 0;
-  virtual void deriv(P& result, const Handle< GaugeState<P,Q> >& state) const 
-  virtual Double S(const Handle< GaugeState<P,Q> >& state) const = 0;
-
-  class LinearGaugeAction : public GaugeAction< multi1d<LatticeColorMatrix>, multi1d<LatticeColorMatrix> >
-  typedef multi1d<LatticeColorMatrix>  P;
-    */
+///////////////////////////////////////////////////////////////////////////////
+// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
+///////////////////////////////////////////////////////////////////////////////
+#include <qcd/action/fermion/g5HermitianLinop.h>
 
 #endif
diff --git a/lib/qcd/action/fermion/g5HermitianLinop.h b/lib/qcd/action/fermion/g5HermitianLinop.h
new file mode 100644
index 00000000..9a8cc2eb
--- /dev/null
+++ b/lib/qcd/action/fermion/g5HermitianLinop.h
@@ -0,0 +1,38 @@
+#ifndef G5_HERMITIAN_LINOP
+#define G5_HERMITIAN_LINOP
+namespace Grid {
+  namespace QCD {
+////////////////////////////////////////////////////////////////////
+// Wrap an already herm matrix
+////////////////////////////////////////////////////////////////////
+template<class Matrix,class Field>
+class Gamma5HermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+public:
+  Gamma5HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
+  void Op     (const Field &in, Field &out){
+    _Mat.M(in,out);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    _Mat.M(in,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+
+    HermOp(in,out);
+    
+    ComplexD dot;
+    dot= innerProduct(in,out);
+    n1=real(dot);
+    
+    dot = innerProduct(out,out);
+    n2=real(dot);
+  }
+  void HermOp(const Field &in, Field &out){
+    Field tmp(in._grid);
+    _Mat.M(in,tmp);
+    G5R5(out,tmp);
+  }
+};
+
+}}
+#endif
diff --git a/lib/qcd/Dirac.cc b/lib/qcd/spin/Dirac.cc
similarity index 100%
rename from lib/qcd/Dirac.cc
rename to lib/qcd/spin/Dirac.cc
diff --git a/lib/qcd/Dirac.h b/lib/qcd/spin/Dirac.h
similarity index 100%
rename from lib/qcd/Dirac.h
rename to lib/qcd/spin/Dirac.h
diff --git a/lib/qcd/TwoSpinor.h b/lib/qcd/spin/TwoSpinor.h
similarity index 100%
rename from lib/qcd/TwoSpinor.h
rename to lib/qcd/spin/TwoSpinor.h
diff --git a/lib/qcd/LinalgUtils.h b/lib/qcd/utils/LinalgUtils.h
similarity index 88%
rename from lib/qcd/LinalgUtils.h
rename to lib/qcd/utils/LinalgUtils.h
index 2b83d115..ef85e728 100644
--- a/lib/qcd/LinalgUtils.h
+++ b/lib/qcd/utils/LinalgUtils.h
@@ -109,5 +109,24 @@ PARALLEL_FOR_LOOP
     vstream(z._odata[ss+s],tmp);
   }
 }
+
+template<class vobj> 
+void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
+{
+  GridBase *grid=x._grid;
+  z.checkerboard = x.checkerboard;
+  conformable(x,z);
+  int Ls = grid->_rdimensions[0];
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    vobj tmp;
+    for(int s=0;s<Ls;s++){
+      int sp = Ls-1-s;
+      multGamma5(tmp(),x._odata[ss+s]());
+      vstream(z._odata[ss+sp],tmp);
+    }
+  }
+}
+
 }}
 #endif 
diff --git a/lib/qcd/SpaceTimeGrid.cc b/lib/qcd/utils/SpaceTimeGrid.cc
similarity index 100%
rename from lib/qcd/SpaceTimeGrid.cc
rename to lib/qcd/utils/SpaceTimeGrid.cc
diff --git a/lib/qcd/SpaceTimeGrid.h b/lib/qcd/utils/SpaceTimeGrid.h
similarity index 100%
rename from lib/qcd/SpaceTimeGrid.h
rename to lib/qcd/utils/SpaceTimeGrid.h
diff --git a/tests/Make.inc b/tests/Make.inc
index 533cbf17..de9d0709 100644
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -14,6 +14,10 @@ Test_cayley_even_odd_SOURCES=Test_cayley_even_odd.cc
 Test_cayley_even_odd_LDADD=-lGrid
 
 
+Test_cf_cr_unprec_SOURCES=Test_cf_cr_unprec.cc
+Test_cf_cr_unprec_LDADD=-lGrid
+
+
 Test_contfrac_cg_SOURCES=Test_contfrac_cg.cc
 Test_contfrac_cg_LDADD=-lGrid
 
@@ -42,6 +46,10 @@ Test_dwf_cg_unprec_SOURCES=Test_dwf_cg_unprec.cc
 Test_dwf_cg_unprec_LDADD=-lGrid
 
 
+Test_dwf_cr_unprec_SOURCES=Test_dwf_cr_unprec.cc
+Test_dwf_cr_unprec_LDADD=-lGrid
+
+
 Test_dwf_even_odd_SOURCES=Test_dwf_even_odd.cc
 Test_dwf_even_odd_LDADD=-lGrid
 
@@ -90,6 +98,10 @@ Test_wilson_cg_unprec_SOURCES=Test_wilson_cg_unprec.cc
 Test_wilson_cg_unprec_LDADD=-lGrid
 
 
+Test_wilson_cr_unprec_SOURCES=Test_wilson_cr_unprec.cc
+Test_wilson_cr_unprec_LDADD=-lGrid
+
+
 Test_wilson_even_odd_SOURCES=Test_wilson_even_odd.cc
 Test_wilson_even_odd_LDADD=-lGrid
 

From e2e076d307ff1cc78842c51982557200c7a5d699 Mon Sep 17 00:00:00 2001
From: Matt Spraggs <matthew.spraggs@gmail.com>
Date: Sun, 7 Jun 2015 17:06:25 +0100
Subject: [PATCH 02/24] Removed std::string calls from NerscIO map indexing

---
 lib/parallelIO/NerscIO.h | 44 ++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index 8a1775f1..4db182e0 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -164,37 +164,37 @@ inline int readNerscHeader(std::string file,GridBase *grid,  NerscField &field)
   // chomp the values
   //////////////////////////////////////////////////
 
-  field.hdr_version    = header[std::string("HDR_VERSION")];
-  field.data_type      = header[std::string("DATATYPE")];
-  field.storage_format = header[std::string("STORAGE_FORMAT")];
+  field.hdr_version    = header["HDR_VERSION"];
+  field.data_type      = header["DATATYPE"];
+  field.storage_format = header["STORAGE_FORMAT"];
   
-  field.dimension[0] = std::stol(header[std::string("DIMENSION_1")]);
-  field.dimension[1] = std::stol(header[std::string("DIMENSION_2")]);
-  field.dimension[2] = std::stol(header[std::string("DIMENSION_3")]);
-  field.dimension[3] = std::stol(header[std::string("DIMENSION_4")]);
+  field.dimension[0] = std::stol(header["DIMENSION_1"]);
+  field.dimension[1] = std::stol(header["DIMENSION_2"]);
+  field.dimension[2] = std::stol(header["DIMENSION_3"]);
+  field.dimension[3] = std::stol(header["DIMENSION_4"]);
 
   assert(grid->_ndimension == 4);
   for(int d=0;d<4;d++){
     assert(grid->_fdimensions[d]==field.dimension[d]);
   }
 
-  field.link_trace = std::stod(header[std::string("LINK_TRACE")]);
-  field.plaquette  = std::stod(header[std::string("PLAQUETTE")]);
+  field.link_trace = std::stod(header["LINK_TRACE"]);
+  field.plaquette  = std::stod(header["PLAQUETTE"]);
 
-  field.boundary[0] = header[std::string("BOUNDARY_1")];
-  field.boundary[1] = header[std::string("BOUNDARY_2")];
-  field.boundary[2] = header[std::string("BOUNDARY_3")];
-  field.boundary[3] = header[std::string("BOUNDARY_4")];
+  field.boundary[0] = header["BOUNDARY_1"];
+  field.boundary[1] = header["BOUNDARY_2"];
+  field.boundary[2] = header["BOUNDARY_3"];
+  field.boundary[3] = header["BOUNDARY_4"];
 
-  field.checksum = std::stoul(header[std::string("CHECKSUM")],0,16);
-  field.ensemble_id      = header[std::string("ENSEMBLE_ID")];
-  field.ensemble_label   = header[std::string("ENSEMBLE_LABEL")];
-  field.sequence_number  = std::stol(header[std::string("SEQUENCE_NUMBER")]);
-  field.creator          = header[std::string("CREATOR")];
-  field.creator_hardware = header[std::string("CREATOR_HARDWARE")];
-  field.creation_date    = header[std::string("CREATION_DATE")];
-  field.archive_date     = header[std::string("ARCHIVE_DATE")];
-  field.floating_point   = header[std::string("FLOATING_POINT")];
+  field.checksum = std::stoul(header["CHECKSUM"],0,16);
+  field.ensemble_id      = header["ENSEMBLE_ID"];
+  field.ensemble_label   = header["ENSEMBLE_LABEL"];
+  field.sequence_number  = std::stol(header["SEQUENCE_NUMBER"]);
+  field.creator          = header["CREATOR"];
+  field.creator_hardware = header["CREATOR_HARDWARE"];
+  field.creation_date    = header["CREATION_DATE"];
+  field.archive_date     = header["ARCHIVE_DATE"];
+  field.floating_point   = header["FLOATING_POINT"];
 
   return field.data_start;
 }

From aad51ffe3acfae689b8d1d9e5a3856c708f8663d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 8 Jun 2015 12:02:26 +0100
Subject: [PATCH 03/24] Prep for mgrid

---
 lib/Algorithms.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Algorithms.h b/lib/Algorithms.h
index c2491cf4..0252606f 100644
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -3,6 +3,7 @@
 
 #include <algorithms/SparseMatrix.h>
 #include <algorithms/LinearOperator.h>
+#include <algorithms/CoarsenedMatrix.h>
 
 #include <algorithms/iterative/ConjugateGradient.h>
 #include <algorithms/iterative/ConjugateResidual.h>

From 42c22d4caecc6ed6afb0f8ac631cc54ffde2fa8a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 8 Jun 2015 12:02:53 +0100
Subject: [PATCH 04/24] Prep for multigrid

---
 lib/algorithms/LinearOperator.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index 492b34db..ee0b2ec1 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -16,6 +16,11 @@ namespace Grid {
   /////////////////////////////////////////////////////////////////////////////////////////////
     template<class Field> class LinearOperatorBase {
     public:
+
+      // Support for coarsening to a multigrid
+      virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
+      virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
+
       virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
       virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
       virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
@@ -43,6 +48,14 @@ namespace Grid {
       Matrix &_Mat;
     public:
     MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
+
+      // Support for coarsening to a multigrid
+      void OpDiag (const Field &in, Field &out) {
+	_Mat.Mdiag(in,out);
+      }
+      void OpDir  (const Field &in, Field &out,int dir,int disp) {
+	_Mat.Mdir(in,out,dir,disp);
+      }
       void Op     (const Field &in, Field &out){
 	_Mat.M(in,out);
       }
@@ -66,6 +79,13 @@ namespace Grid {
       Matrix &_Mat;
     public:
     HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
+      // Support for coarsening to a multigrid
+      void OpDiag (const Field &in, Field &out) {
+	_Mat.Mdiag(in,out);
+      }
+      void OpDir  (const Field &in, Field &out,int dir,int disp) {
+	_Mat.Mdir(in,out,dir,disp);
+      }
       void Op     (const Field &in, Field &out){
 	_Mat.M(in,out);
       }
@@ -116,6 +136,14 @@ namespace Grid {
       void AdjOp     (const Field &in, Field &out){ 
 	MpcDag(in,out);
       }
+      // Support for coarsening to a multigrid
+      void OpDiag (const Field &in, Field &out) {
+	assert(0); // must coarsen the unpreconditioned system
+      }
+      void OpDir  (const Field &in, Field &out,int dir,int disp) {
+	assert(0);
+      }
+
     };
     template<class Matrix,class Field>
       class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {

From 769ef7b0f5d9e1edb45dc31d4ce51f430a5b0db8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 8 Jun 2015 12:03:36 +0100
Subject: [PATCH 05/24] sqrt

---
 tests/Test_remez.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/Test_remez.cc b/tests/Test_remez.cc
index b5993917..0d65ac7b 100644
--- a/tests/Test_remez.cc
+++ b/tests/Test_remez.cc
@@ -35,7 +35,7 @@ void MultiShiftFunction::csv(std::ostream &out)
 {
   for (double x=lo; x<hi; x*=1.05) {
     double f = approx(x);
-    double r = sqrt(x);
+    double r = std::sqrt(x);
     out<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
   }
   return;
@@ -80,8 +80,8 @@ int main (int argc, char ** argv)
   InvSqrt.gnuplot(gnuplot);
 
   double x=0.6789;
-  double sx=sqrt(x);
-  double ssx=sqrt(sx);
+  double sx=std::sqrt(x);
+  double ssx=std::sqrt(sx);
   double isx=1.0/sx;
   double issx=1.0/ssx;
 

From b0873e7ed2ec6ffd91f4a18cb86e2333dda7d1ba Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 8 Jun 2015 12:04:59 +0100
Subject: [PATCH 06/24] Conjugate residual algorithm; some more unary functions

---
 lib/Make.inc                                  |   2 +-
 lib/Simd.h                                    |   3 +
 lib/Tensors.h                                 |   1 +
 lib/algorithms/CoarsenedMatrix.h              | 146 ++++++++++++++
 lib/algorithms/SparseMatrix.h                 |   3 +-
 lib/algorithms/iterative/SchurRedBlack.h      |   2 +-
 lib/lattice/Lattice_base.h                    |   1 +
 lib/lattice/Lattice_local.h                   |   3 +-
 lib/lattice/Lattice_reality.h                 |   8 +-
 lib/lattice/Lattice_transfer.h                | 186 +++++++++++++-----
 lib/lattice/Lattice_unary.h                   |  32 +++
 lib/qcd/action/fermion/CayleyFermion5D.cc     |  21 ++
 lib/qcd/action/fermion/CayleyFermion5D.h      |   4 +
 .../fermion/ContinuedFractionFermion5D.cc     |  12 ++
 .../fermion/ContinuedFractionFermion5D.h      |   3 +
 lib/qcd/action/fermion/FermionOperator.h      |   3 +
 .../fermion/PartialFractionFermion5D.cc       |  16 ++
 .../action/fermion/PartialFractionFermion5D.h |   3 +
 lib/qcd/action/fermion/WilsonFermion.cc       |  20 ++
 lib/qcd/action/fermion/WilsonFermion.h        |   6 +-
 lib/qcd/action/fermion/WilsonFermion5D.cc     |  22 +++
 lib/qcd/action/fermion/WilsonFermion5D.h      |   4 +
 lib/qcd/action/fermion/WilsonKernels.cc       | 148 ++++++++++++--
 lib/qcd/action/fermion/WilsonKernels.h        |   3 +
 lib/qcd/action/fermion/g5HermitianLinop.h     |  15 +-
 lib/simd/Grid_avx.h                           |   5 +-
 lib/simd/Grid_vector_types.h                  |  30 ++-
 lib/simd/Grid_vector_unops.h                  |  60 ++++++
 lib/tensors/Tensor_class.h                    |   7 +
 lib/tensors/Tensor_unary.h                    |  37 ++++
 tests/Make.inc                                |  10 +-
 tests/Test_GaugeAction.cc                     |   2 +-
 tests/Test_cayley_coarsen_support.cc          | 108 ++++++++++
 tests/Test_cf_coarsen_support.cc              |  87 ++++++++
 tests/Test_cf_cr_unprec.cc                    |  58 ++++++
 tests/Test_dwf_cr_unprec.cc                   |  63 ++++++
 tests/Test_nersc_io.cc                        |   2 +-
 tests/Test_wilson_cr_unprec.cc                |  59 ++++++
 38 files changed, 1116 insertions(+), 79 deletions(-)
 create mode 100644 lib/algorithms/CoarsenedMatrix.h
 create mode 100644 lib/lattice/Lattice_unary.h
 create mode 100644 lib/simd/Grid_vector_unops.h
 create mode 100644 lib/tensors/Tensor_unary.h
 create mode 100644 tests/Test_cayley_coarsen_support.cc
 create mode 100644 tests/Test_cf_coarsen_support.cc
 create mode 100644 tests/Test_cf_cr_unprec.cc
 create mode 100644 tests/Test_dwf_cr_unprec.cc
 create mode 100644 tests/Test_wilson_cr_unprec.cc

diff --git a/lib/Make.inc b/lib/Make.inc
index ba35e25f..069e818d 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./Tensors.h ./Threads.h
+HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./Tensors.h ./Threads.h
 
 CCFILES=./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
diff --git a/lib/Simd.h b/lib/Simd.h
index 26edb4c9..f93c3070 100644
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -35,6 +35,8 @@ namespace Grid {
   inline RealD conjugate(const RealD  & r){ return r; }
   inline RealD real(const RealD  & r){ return r; }
 
+  inline RealD sqrt(const RealD  & r){ return std::sqrt(r); }
+
   inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
   inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
   inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
@@ -112,6 +114,7 @@ namespace Grid {
 };
 
 #include <simd/Grid_vector_types.h>
+#include <simd/Grid_vector_unops.h>
 
 namespace Grid {
   // Default precision
diff --git a/lib/Tensors.h b/lib/Tensors.h
index fa0ca7a9..e585b116 100644
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@@ -12,6 +12,7 @@
 #include <tensors/Tensor_peek.h>
 #include <tensors/Tensor_poke.h>
 #include <tensors/Tensor_reality.h>
+#include <tensors/Tensor_unary.h>
 #include <tensors/Tensor_extract_merge.h>
     
 #endif
diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h
new file mode 100644
index 00000000..54ea279e
--- /dev/null
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -0,0 +1,146 @@
+#ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
+#define  GRID_ALGORITHM_COARSENED_MATRIX_H
+
+#include <Grid.h>
+
+namespace Grid {
+
+  class Geometry {
+  public:
+    int npoint;
+    int dimension;
+    std::vector<int> directions   ;
+    std::vector<int> displacements;
+
+    Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
+      for(int d=0;d<dimension;d++){
+	directions[2*d  ] = d;
+	directions[2*d+1] = d;
+	displacements[2*d  ] = +1;
+	displacements[2*d+1] = -1;
+      }
+      directions   [2*dimension]=0;
+      displacements[2*dimension]=0;
+    }
+    
+    std::vector<int> GetDelta(int point) {
+      std::vector<int> delta(dimension,0);
+      delta[directions[point]] = displacements[point];
+      return delta;
+    };
+
+  };
+  
+  // Fine Object == (per site) type of fine field
+  // nbasis      == number of deflation vectors
+  template<class Fobj,class CComplex,int nbasis>
+  class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<vComplex,nbasis > > >  {
+  public:
+    
+    typedef iVector<vComplex,nbasis > siteVector;
+    typedef Lattice<iVector<vComplex,nbasis > > CoarseVector;
+    typedef Lattice<iMatrix<vComplex,nbasis > > CoarseMatrix;
+
+    typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
+    typedef Lattice<Fobj >        FineField;
+
+    ////////////////////
+    // Data members
+    ////////////////////
+    Geometry         geom;
+    GridBase *       _grid; 
+    CartesianStencil Stencil; 
+    std::vector<CoarseMatrix> A;
+    std::vector<siteVector,alignedAllocator<siteVector> >   comm_buf;
+      
+    ///////////////////////
+    // Interface
+    ///////////////////////
+    GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
+
+    RealD M    (const CoarseVector &in, CoarseVector &out){
+      
+      SimpleCompressor<siteVector> compressor;
+      Stencil.HaloExchange(in,comm_buf,compressor);
+
+      //PARALLEL_FOR_LOOP
+      for(int ss=0;ss<Grid()->oSites();ss++){
+        siteVector res = zero;
+	siteVector tmp;
+	siteVector nbr;
+
+	int offset,local,perm;
+	for(int point=0;point<geom.npoint;point++){
+	  offset = Stencil._offsets [point][ss];
+	  local  = Stencil._is_local[point][ss];
+	  perm   = Stencil._permute[point][ss];
+	  
+	  if(local&&perm) { 
+	    permute(nbr,in._odata[offset],perm);
+	  } else if(local) { 
+	    nbr = in._odata[offset];
+	  } else {
+	    nbr = comm_buf[offset];
+	  }
+	  res = res + A[point]._odata[ss]*nbr;
+	}
+	vstream(out._odata[ss],res);
+      }
+      return norm2(out);
+    };
+
+    RealD Mdag (const CoarseVector &in, CoarseVector &out){ 
+      return M(in,out);
+    };
+    // Defer support for further coarsening for now
+    void Mdiag    (const CoarseVector &in,  CoarseVector &out){};
+    void Mdir     (const CoarseVector &in,  CoarseVector &out,int dir, int disp){};
+
+    CoarsenedMatrix(GridCartesian &CoarseGrid) 	: 
+
+      _grid(&CoarseGrid),
+      geom(CoarseGrid._ndimension),
+      Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
+      A(geom.npoint,&CoarseGrid)
+    {
+      comm_buf.resize(Stencil._unified_buffer_size);
+    };
+
+    void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,std::vector<Lattice<Fobj> > & subspace){
+
+      FineField  phi(FineGrid);
+      FineField Mphi(FineGrid);
+      CoarseVector Proj(Grid()); 
+      CoarseScalar InnerProd(Grid()); 
+
+      // Orthogonalise the subblocks over the basis
+      blockOrthogonalise(InnerProd,subspace);
+
+      // Compute the matrix elements of linop between this orthonormal
+      // set of vectors.
+      for(int i=0;i<nbasis;i++){
+	phi=subspace[i];
+	for(int p=0;p<geom.npoint;p++){ 
+	  int dir = geom.directions[p];
+	  int disp= geom.displacements[p];
+
+	  if ( disp==0 )linop.OpDiag(phi,Mphi);
+	  else linop.OpDir(phi,Mphi,dir,disp); 
+
+	  blockProject(Proj,Mphi,subspace);
+
+	  for(int ss=0;ss<Grid()->oSites();ss++){
+	    for(int j=0;j<nbasis;j++){
+	      A[p]._odata[ss](j,i) = Proj._odata[ss](j);
+	    }
+	  }
+
+	}	
+      }
+      std::cout<<"Computed Coarse Operator"<<std::endl;
+    }
+    
+  };
+
+}
+#endif
diff --git a/lib/algorithms/SparseMatrix.h b/lib/algorithms/SparseMatrix.h
index a51657dc..eb807555 100644
--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@@ -18,8 +18,9 @@ namespace Grid {
 	Field tmp (in._grid);
 	ni=M(in,tmp);
 	no=Mdag(tmp,out);
-	std::cout << "MdagM "<< ni<<" "<<no<<std::endl;
       }
+      virtual  void Mdiag    (const Field &in, Field &out)=0;
+      virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
     };
 
   /////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/algorithms/iterative/SchurRedBlack.h b/lib/algorithms/iterative/SchurRedBlack.h
index 01f2e3d5..3ebc5f1d 100644
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -108,7 +108,7 @@ namespace Grid {
       RealD ns = norm2(in);
       RealD nr = norm2(resid);
 
-      std::cout << "SchurRedBlackDiagMooee solver true unprec resid "<< sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
+      std::cout << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
     }     
   };
 
diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h
index 6b5fe261..ce2990a5 100644
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -299,6 +299,7 @@ PARALLEL_FOR_LOOP
 #include <lattice/Lattice_reality.h>
 #include <lattice/Lattice_coordinate.h>
 #include <lattice/Lattice_rng.h>
+#include <lattice/Lattice_unary.h>
 #include <lattice/Lattice_transfer.h>
 
 
diff --git a/lib/lattice/Lattice_local.h b/lib/lattice/Lattice_local.h
index 7b8bdf9b..ad7a1ff5 100644
--- a/lib/lattice/Lattice_local.h
+++ b/lib/lattice/Lattice_local.h
@@ -25,8 +25,7 @@ PARALLEL_FOR_LOOP
     
     // localInnerProduct
     template<class vobj>
-    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-      -> Lattice<typename vobj::tensor_reduced>
+    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
     {
       Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
 PARALLEL_FOR_LOOP
diff --git a/lib/lattice/Lattice_reality.h b/lib/lattice/Lattice_reality.h
index 4ad6fb7b..8c8bc17a 100644
--- a/lib/lattice/Lattice_reality.h
+++ b/lib/lattice/Lattice_reality.h
@@ -27,9 +27,9 @@ PARALLEL_FOR_LOOP
         return ret;
     };
 
-    template<class vobj> inline auto real(const Lattice<vobj> &z) -> Lattice<decltype(real(z._odata[0]))>
+    template<class vobj> inline auto real(const Lattice<vobj> &z) -> Lattice<vobj>
     {
-      Lattice<decltype(real(z._odata[0]))> ret(z._grid);
+      Lattice<vobj> ret(z._grid);
 PARALLEL_FOR_LOOP
         for(int ss=0;ss<z._grid->oSites();ss++){
             ret._odata[ss] = real(z._odata[ss]);
@@ -37,9 +37,9 @@ PARALLEL_FOR_LOOP
       return ret;
     }
 
-    template<class vobj> inline auto imag(const Lattice<vobj> &z) -> Lattice<decltype(imag(z._odata[0]))>
+    template<class vobj> inline auto imag(const Lattice<vobj> &z) -> Lattice<vobj>
     {
-      Lattice<decltype(imag(z._odata[0]))> ret(z._grid);
+      Lattice<vobj> ret(z._grid);
 PARALLEL_FOR_LOOP
         for(int ss=0;ss<z._grid->oSites();ss++){
             ret._odata[ss] = imag(z._odata[ss]);
diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index f0046b0e..871ed893 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -56,10 +56,10 @@ PARALLEL_FOR_LOOP
   }
   
 
-template<class vobj,int nbasis>
-inline void projectBlockBasis(Lattice<iVector<vComplex,nbasis > > &coarseData,
-			      const             Lattice<vobj>   &fineData,
-			      const std::vector<Lattice<vobj> > &Basis)
+template<class vobj,class CComplex,int nbasis>
+inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
+			 const             Lattice<vobj>   &fineData,
+			 const std::vector<Lattice<vobj> > &Basis)
 {
   GridBase * fine  = fineData._grid;
   GridBase * coarse= coarseData._grid;
@@ -69,13 +69,14 @@ inline void projectBlockBasis(Lattice<iVector<vComplex,nbasis > > &coarseData,
   assert( nbasis == Basis.size() );
   subdivides(coarse,fine); 
   for(int i=0;i<nbasis;i++){
-    conformable(Basis,fineData);
+    conformable(Basis[i],fineData);
   }
 
   std::vector<int>  block_r      (_ndimension);
   
   for(int d=0 ; d<_ndimension;d++){
     block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
+    assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
   }
 
   coarseData=zero;
@@ -92,20 +93,149 @@ inline void projectBlockBasis(Lattice<iVector<vComplex,nbasis > > &coarseData,
 
     for(int i=0;i<nbasis;i++) {
       
-      coarseData._odata[sc][i]=coarseData._odata[sc][i]
-	+ innerProduct(Basis[i]._odata[sf],fineData._odata[sf]);
+      coarseData._odata[sc](i)=coarseData._odata[sc](i)
+	+ TensorRemove( innerProduct(Basis[i]._odata[sf],fineData._odata[sf]));
 
     }
   }
   return;
+}
+
+template<class vobj,class CComplex>
+inline void blockZAXPY(Lattice<vobj> &fineZ,
+		       const Lattice<CComplex> &coarseA,
+		       const Lattice<vobj> &fineX,
+		       const Lattice<vobj> &fineY)
+{
+  GridBase * fine  = fineZ._grid;
+  GridBase * coarse= coarseA._grid;
+
+  fineZ.checkerboard=fineX.checkerboard;
+  subdivides(coarse,fine); // require they map
+  conformable(fineX,fineY);
+  conformable(fineX,fineZ);
+
+  int _ndimension = coarse->_ndimension;
   
+  std::vector<int>  block_r      (_ndimension);
+
+  // FIXME merge with subdivide checking routine as this is redundant
+  for(int d=0 ; d<_ndimension;d++){
+    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
+    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
+  }
+
+PARALLEL_FOR_LOOP
+  for(int sf=0;sf<fine->oSites();sf++){
+    
+    int sc;
+    std::vector<int> coor_c(_ndimension);
+    std::vector<int> coor_f(_ndimension);
+
+    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+
+    // z = A x + y
+    fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
+
+  }
+
+  return;
+}
+template<class vobj,class CComplex>
+  inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
+				const Lattice<vobj> &fineX,
+				const Lattice<vobj> &fineY)
+{
+  typedef decltype(innerProduct(fineX._odata[0],fineY._odata[0])) dotp;
+
+  GridBase *coarse(CoarseInner._grid);
+  GridBase *fine  (fineX._grid);
+
+  Lattice<dotp> fine_inner(fine);
+  Lattice<dotp> coarse_inner(coarse);
+
+  fine_inner = localInnerProduct(fineX,fineY);
+  blockSum(coarse_inner,fine_inner);
+  for(int ss=0;ss<coarse->oSites();ss++){
+    CoarseInner._odata[ss] = coarse_inner._odata[ss];
+  }
+}
+template<class vobj,class CComplex>
+inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
+{
+  GridBase *coarse = ip._grid;
+  blockInnerProduct(ip,fineX,fineX);
+  ip = rsqrt(ip);
+  blockZAXPY(fineX,ip,fineX,fineX);
+}
+// useful in multigrid project;
+// Generic name : Coarsen?
+template<class vobj>
+inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
+{
+  GridBase * fine  = fineData._grid;
+  GridBase * coarse= coarseData._grid;
+
+  subdivides(coarse,fine); // require they map
+
+  int _ndimension = coarse->_ndimension;
+  
+  std::vector<int>  block_r      (_ndimension);
+  
+  for(int d=0 ; d<_ndimension;d++){
+    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
+  }
+
+  coarseData=zero;
+  for(int sf=0;sf<fine->oSites();sf++){
+    
+    int sc;
+    std::vector<int> coor_c(_ndimension);
+    std::vector<int> coor_f(_ndimension);
+
+    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+
+    coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
+
+  }
+  return;
 }
 
 
-template<class vobj,int nbasis>
-inline void promoteBlockBasis(const Lattice<iVector<vComplex,nbasis > > &coarseData,
-			                        Lattice<vobj>   &fineData,
-			      const std::vector<Lattice<vobj> > &Basis)
+template<class vobj,class CComplex>
+inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
+{
+  GridBase *coarse = ip._grid;
+  GridBase *fine   = Basis[0]._grid;
+
+  int       nbasis = Basis.size() ;
+  int  _ndimension = coarse->_ndimension;
+
+  // checks
+  subdivides(coarse,fine); 
+  for(int i=0;i<nbasis;i++){
+    conformable(Basis[i]._grid,fine);
+  }
+
+  for(int v=0;v<nbasis;v++) {
+    for(int u=0;u<v;u++) {
+      //Inner product & remove component 
+      blockInnerProduct(ip,Basis[u],Basis[v]);
+      ip = -ip;
+      blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]);
+    }
+    blockNormalise(ip,Basis[v]);
+  }
+}
+
+template<class vobj,class CComplex,int nbasis>
+inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
+			 Lattice<vobj>   &fineData,
+			 const std::vector<Lattice<vobj> > &Basis)
 {
   GridBase * fine  = fineData._grid;
   GridBase * coarse= coarseData._grid;
@@ -146,40 +276,6 @@ inline void promoteBlockBasis(const Lattice<iVector<vComplex,nbasis > > &coarseD
   
 }
 
-// useful in multigrid project;
-// Generic name : Coarsen?
-template<class vobj>
-inline void sumBlocks(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
-{
-  GridBase * fine  = fineData._grid;
-  GridBase * coarse= coarseData._grid;
-
-  subdivides(coarse,fine); // require they map
-
-  int _ndimension = coarse->_ndimension;
-  
-  std::vector<int>  block_r      (_ndimension);
-  
-  for(int d=0 ; d<_ndimension;d++){
-    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
-  }
-
-  coarseData=zero;
-  for(int sf=0;sf<fine->oSites();sf++){
-    
-    int sc;
-    std::vector<int> coor_c(_ndimension);
-    std::vector<int> coor_f(_ndimension);
-
-    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
-
-    coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
-
-  }
-  return;
-}
 
 }
 #endif
diff --git a/lib/lattice/Lattice_unary.h b/lib/lattice/Lattice_unary.h
new file mode 100644
index 00000000..131d7d68
--- /dev/null
+++ b/lib/lattice/Lattice_unary.h
@@ -0,0 +1,32 @@
+#ifndef GRID_LATTICE_UNARY_H
+#define GRID_LATTICE_UNARY_H
+
+namespace Grid {
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  avoid copy back routines for mult, mac, sub, add
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class obj> Lattice<obj> sqrt(const Lattice<obj> &rhs){
+    Lattice<obj> ret(rhs._grid);
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      ret._odata[ss]=sqrt(rhs._odata[ss]);
+    }
+    return ret;
+  }
+  template<class obj> Lattice<obj> rsqrt(const Lattice<obj> &rhs){
+    Lattice<obj> ret(rhs._grid);
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      ret._odata[ss]=rsqrt(rhs._odata[ss]);
+    }
+    return ret;
+  }
+
+
+}
+#endif
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc
index e47ff331..7db67dd7 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -165,6 +165,27 @@ namespace QCD {
     }
   }
 
+  void  CayleyFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
+    LatticeFermion tmp(psi._grid);
+    // Assemble the 5d matrix
+    for(int s=0;s<Ls;s++){
+      if ( s==0 ) {
+	//	tmp = bs psi[s] + cs[s] psi[s+1}
+	//      tmp+= -mass*cs[s] psi[s+1}
+	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
+	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
+      } else if ( s==(Ls-1)) { 
+	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
+	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
+      } else {
+	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
+	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
+      }
+    }
+    // Apply 4d dslash fragment
+    DhopDir(tmp,chi,dir,disp);
+  }
+
   void CayleyFermion5D::MooeeDag    (const LatticeFermion &psi, LatticeFermion &chi)
   {
     for (int s=0;s<Ls;s++){
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h
index e2175d77..208ab0bb 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -21,6 +21,10 @@ namespace Grid {
       virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out);
       virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
       virtual void   Instantiatable(void)=0;
+
+      // Efficient support for multigrid coarsening
+      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+
       //    protected:
       RealD mass;
 
diff --git a/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc b/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
index 6f99aa7f..0d54f272 100644
--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -90,6 +90,18 @@ namespace Grid {
       // Can ignore "dag"
       return M(psi,chi);
     }
+    void  ContinuedFractionFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
+      DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
+      int sign=1;
+      for(int s=0;s<Ls;s++){
+	if ( s==(Ls-1) ){
+	  ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
+	} else {
+	  ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
+	}
+	sign=-sign; 
+      }
+    }
     void   ContinuedFractionFermion5D::Meooe       (const LatticeFermion &psi, LatticeFermion &chi)
     {
       // Apply 4d dslash
diff --git a/lib/qcd/action/fermion/ContinuedFractionFermion5D.h b/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
index b7e321c9..43b0f756 100644
--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -24,6 +24,9 @@ namespace Grid {
       //      virtual void   Instantiatable(void)=0;
       virtual void   Instantiatable(void) =0;
 
+      // Efficient support for multigrid coarsening
+      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+
       // Constructors
       ContinuedFractionFermion5D(LatticeGaugeField &_Umu,
 				 GridCartesian         &FiveDimGrid,
diff --git a/lib/qcd/action/fermion/FermionOperator.h b/lib/qcd/action/fermion/FermionOperator.h
index 47c47478..1aa2f8f0 100644
--- a/lib/qcd/action/fermion/FermionOperator.h
+++ b/lib/qcd/action/fermion/FermionOperator.h
@@ -40,6 +40,9 @@ namespace Grid {
       virtual void DhopOE(const FermionField &in, FermionField &out,int dag)=0;
       virtual void DhopEO(const FermionField &in, FermionField &out,int dag)=0;
 
+      virtual void  Mdiag(const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's
+      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+      virtual void  DhopDir(const FermionField &in, FermionField &out,int dir,int disp)=0; // implemented by WilsonFermion and WilsonFermion5D
 
     };
 
diff --git a/lib/qcd/action/fermion/PartialFractionFermion5D.cc b/lib/qcd/action/fermion/PartialFractionFermion5D.cc
index 285144c6..a148fe40 100644
--- a/lib/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/lib/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -2,6 +2,22 @@
 namespace Grid {
   namespace QCD {
 
+    void  PartialFractionFermion5D::Mdir (const LatticeFermion &psi, LatticeFermion &chi,int dir,int disp){
+      // this does both dag and undag but is trivial; make a common helper routing
+
+      int sign = 1;
+
+      DhopDir(psi,chi,dir,disp);
+
+      int nblock=(Ls-1)/2;
+      for(int b=0;b<nblock;b++){
+	int s = 2*b;
+	ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s); 
+	ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1); 
+      }
+      ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
+
+    }
     void   PartialFractionFermion5D::Meooe_internal(const LatticeFermion &psi, LatticeFermion &chi,int dag)
     {
       // this does both dag and undag but is trivial; make a common helper routing
diff --git a/lib/qcd/action/fermion/PartialFractionFermion5D.h b/lib/qcd/action/fermion/PartialFractionFermion5D.h
index 301b07c0..e734caa0 100644
--- a/lib/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/lib/qcd/action/fermion/PartialFractionFermion5D.h
@@ -30,6 +30,9 @@ namespace Grid {
 
       virtual void   Instantiatable(void) =0; // ensure no make-eee
 
+      // Efficient support for multigrid coarsening
+      virtual void  Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+
       // Constructors
       PartialFractionFermion5D(LatticeGaugeField &_Umu,
 				    GridCartesian         &FiveDimGrid,
diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc
index 9f2da251..13f16c5a 100644
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -93,6 +93,26 @@ void WilsonFermion::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
   out.checkerboard = in.checkerboard;
   MooeeInv(in,out);
 }
+void WilsonFermion::Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp)
+{
+  DhopDir(in,out,dir,disp);
+}
+void WilsonFermion::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp){
+  WilsonCompressor compressor(DaggerNo);
+  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+  
+  assert( (disp==1)||(disp==-1) );
+
+  int skip = (disp==1) ? 0 : 1;
+
+  int dirdisp = dir+skip*4;
+
+PARALLEL_FOR_LOOP
+  for(int sss=0;sss<in._grid->oSites();sss++){
+    DiracOpt::DhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp);
+  }
+
+};
 
 void WilsonFermion::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U,
 				const LatticeFermion &in, LatticeFermion &out,int dag)
diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h
index 6040d328..60726fdd 100644
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -26,7 +26,7 @@ namespace Grid {
       void   MeooeDag    (const LatticeFermion &in, LatticeFermion &out);
       virtual void   Mooee       (const LatticeFermion &in, LatticeFermion &out); // remain virtual so we 
       virtual void   MooeeDag    (const LatticeFermion &in, LatticeFermion &out); // can derive Clover
-      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out); // from Wilson bas
+      virtual void   MooeeInv    (const LatticeFermion &in, LatticeFermion &out); // from Wilson base
       virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
 
       // non-hermitian hopping term; half cb or both
@@ -34,6 +34,10 @@ namespace Grid {
       void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
       void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
 
+      // Multigrid assistance
+      void   Mdir (const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+      void DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+
       ///////////////////////////////////////////////////////////////
       // Extra methods added by derived
       ///////////////////////////////////////////////////////////////
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index d22701b0..4ddd14ab 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -82,6 +82,28 @@ void WilsonFermion5D::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGau
     pokeIndex<LorentzIndex>(Uds,U,mu+4);
   }
 }
+void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp)
+{
+  assert( (disp==1)||(disp==-1) );
+
+  WilsonCompressor compressor(DaggerNo);
+  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+  
+  int skip = (disp==1) ? 0 : 1;
+
+  int dirdisp = dir+skip*4;
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<Umu._grid->oSites();ss++){
+    for(int s=0;s<Ls;s++){
+      int sU=ss;
+      int sF = s+Ls*sU; 
+      DiracOpt::DhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp);
+    }
+  }
+
+};
+
 void WilsonFermion5D::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
 				   LatticeDoubledGaugeField & U,
 				   const LatticeFermion &in, LatticeFermion &out,int dag)
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h
index 4d4a7aed..10138991 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -57,6 +57,10 @@ namespace Grid {
       void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
       void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
 
+      // add a DhopComm
+      // -- suboptimal interface will presently trigger multiple comms.
+      void DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp);
+
       ///////////////////////////////////////////////////////////////
       // New methods added 
       ///////////////////////////////////////////////////////////////
diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index 879bbb7a..518b8004 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -13,7 +13,6 @@ void DiracOpt::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
     vHalfSpinColourVector Uchi;
     int offset,local,perm, ptype;
 
-    //#define VERBOSE( A)  if ( ss<10 ) { std::cout << "site " <<ss << " " #A " neigh " << offset << " perm "<< perm <<std::endl;}    
 
     // Xp
     int ss = sF;
@@ -33,12 +32,6 @@ void DiracOpt::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
     mult(&Uchi(),&U._odata[sU](Xp),&chi());
     spReconXp(result,Uchi);
 
-    //    std::cout << "XP_RECON"<<std::endl;
-    //    std::cout << result()(0)(0) <<" "<<result()(0)(1) <<" "<<result()(0)(2) <<std::endl;
-    //    std::cout << result()(1)(0) <<" "<<result()(1)(1) <<" "<<result()(1)(2) <<std::endl;
-    //    std::cout << result()(2)(0) <<" "<<result()(2)(1) <<" "<<result()(2)(2) <<std::endl;
-    //    std::cout << result()(3)(0) <<" "<<result()(3)(1) <<" "<<result()(3)(2) <<std::endl;
-
     // Yp
     offset = st._offsets [Yp][ss];
     local  = st._is_local[Yp][ss];
@@ -93,8 +86,7 @@ void DiracOpt::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
     perm   = st._permute[Xm][ss];
     ptype  = st._permute_type[Xm];
 
-    if ( local && perm ) 
-    {
+    if ( local && perm ) {
       spProjXm(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
@@ -104,12 +96,6 @@ void DiracOpt::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
     }
     mult(&Uchi(),&U._odata[sU](Xm),&chi());
     accumReconXm(result,Uchi);
-    //  std::cout << "XM_RECON_ACCUM"<<std::endl;
-    //    std::cout << result()(0)(0) <<" "<<result()(0)(1) <<" "<<result()(0)(2) <<std::endl;
-    //    std::cout << result()(1)(0) <<" "<<result()(1)(1) <<" "<<result()(1)(2) <<std::endl;
-    //    std::cout << result()(2)(0) <<" "<<result()(2)(1) <<" "<<result()(2)(2) <<std::endl;
-    //    std::cout << result()(3)(0) <<" "<<result()(3)(1) <<" "<<result()(3)(2) <<std::endl;
-
 
     // Ym
     offset = st._offsets [Ym][ss];
@@ -308,4 +294,136 @@ void DiracOpt::DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 
     vstream(out._odata[ss],result*(-0.5));
 }
+
+void DiracOpt::DhopDir(CartesianStencil &st,LatticeDoubledGaugeField &U,
+			std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+		       int sF,int sU,const LatticeFermion &in, LatticeFermion &out,int dirdisp)
+{
+    vHalfSpinColourVector  tmp;    
+    vHalfSpinColourVector  chi;    
+    vSpinColourVector result;
+    vHalfSpinColourVector Uchi;
+    int offset,local,perm, ptype;
+    int ss=sF;
+    
+    offset = st._offsets [dirdisp][ss];
+    local  = st._is_local[dirdisp][ss];
+    perm   = st._permute[dirdisp][ss];
+    ptype  = st._permute_type[dirdisp];
+
+    // Xp
+    if(dirdisp==Xp){
+      if ( local && perm ) {
+	spProjXp(tmp,in._odata[offset]);
+	permute(chi,tmp,ptype);
+      } else if ( local ) {
+	spProjXp(chi,in._odata[offset]);
+      } else { 
+	chi=buf[offset];
+      }
+      mult(&Uchi(),&U._odata[sU](Xp),&chi());
+      spReconXp(result,Uchi);
+    }
+
+    // Yp
+    if ( dirdisp==Yp ){
+      if ( local && perm ) {
+	spProjYp(tmp,in._odata[offset]);
+	permute(chi,tmp,ptype);
+      } else if ( local ) {
+	spProjYp(chi,in._odata[offset]);
+      } else { 
+	chi=buf[offset];
+      }
+      mult(&Uchi(),&U._odata[sU](Yp),&chi());
+      spReconYp(result,Uchi);
+    }
+
+    // Zp
+    if ( dirdisp ==Zp ){
+      if ( local && perm ) {
+	spProjZp(tmp,in._odata[offset]);
+	permute(chi,tmp,ptype);
+      } else if ( local ) {
+	spProjZp(chi,in._odata[offset]);
+      } else { 
+	chi=buf[offset];
+      }
+      mult(&Uchi(),&U._odata[sU](Zp),&chi());
+      spReconZp(result,Uchi);
+    }
+
+    // Tp
+    if ( dirdisp ==Tp ){
+      if ( local && perm ) {
+	spProjTp(tmp,in._odata[offset]);
+	permute(chi,tmp,ptype);
+      } else if ( local ) {
+	spProjTp(chi,in._odata[offset]);
+      } else { 
+	chi=buf[offset];
+      }
+      mult(&Uchi(),&U._odata[sU](Tp),&chi());
+      spReconTp(result,Uchi);
+    }
+
+    // Xm
+    if ( dirdisp==Xm ){
+      if ( local && perm ) {
+	spProjXm(tmp,in._odata[offset]);
+	permute(chi,tmp,ptype);
+      } else if ( local ) {
+	spProjXm(chi,in._odata[offset]);
+      } else { 
+	chi=buf[offset];
+      }
+      mult(&Uchi(),&U._odata[sU](Xm),&chi());
+      spReconXm(result,Uchi);
+    }
+
+    // Ym
+    if ( dirdisp == Ym ){
+      if ( local && perm ) {
+	spProjYm(tmp,in._odata[offset]);
+	permute(chi,tmp,ptype);
+      } else if ( local ) {
+	spProjYm(chi,in._odata[offset]);
+      } else { 
+	chi=buf[offset];
+      }
+      mult(&Uchi(),&U._odata[sU](Ym),&chi());
+      spReconYm(result,Uchi);
+    }
+
+    // Zm
+    if ( dirdisp == Zm ){
+      if ( local && perm ) {
+	spProjZm(tmp,in._odata[offset]);
+	permute(chi,tmp,ptype);
+      } else if ( local ) {
+	spProjZm(chi,in._odata[offset]);
+      } else { 
+	chi=buf[offset];
+      }
+      mult(&Uchi(),&U._odata[sU](Zm),&chi());
+      spReconZm(result,Uchi);
+    }
+
+    // Tm
+    if ( dirdisp==Tm ) {
+      if ( local && perm ) {
+	spProjTm(tmp,in._odata[offset]);
+	permute(chi,tmp,ptype);
+      } else if ( local ) {
+	spProjTm(chi,in._odata[offset]);
+      } else { 
+	chi=buf[offset];
+      }
+      mult(&Uchi(),&U._odata[sU](Tm),&chi());
+      spReconTm(result,Uchi);
+    }
+
+    vstream(out._odata[ss],result*(-0.5));
+}
+
 }}
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index cc535022..db384ddf 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -20,6 +20,9 @@ namespace Grid {
       static void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 			      std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 			      int sF,int sU,const LatticeFermion &in, LatticeFermion &out);
+      static void DhopDir(CartesianStencil &st,LatticeDoubledGaugeField &U,
+			   std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
+			  int sF,int sU,const LatticeFermion &in, LatticeFermion &out,int dirdisp);
 
     };
 
diff --git a/lib/qcd/action/fermion/g5HermitianLinop.h b/lib/qcd/action/fermion/g5HermitianLinop.h
index 9a8cc2eb..5b3411ec 100644
--- a/lib/qcd/action/fermion/g5HermitianLinop.h
+++ b/lib/qcd/action/fermion/g5HermitianLinop.h
@@ -11,11 +11,22 @@ class Gamma5HermitianLinearOperator : public LinearOperatorBase<Field> {
 public:
   Gamma5HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
   void Op     (const Field &in, Field &out){
-    _Mat.M(in,out);
+    HermOp(in,out);
   }
   void AdjOp     (const Field &in, Field &out){
-    _Mat.M(in,out);
+    HermOp(in,out);
   }
+  void OpDiag (const Field &in, Field &out) {
+    Field tmp(in._grid);
+    _Mat.Mdiag(in,tmp);
+    G5R5(out,tmp);
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    Field tmp(in._grid);
+    _Mat.Mdir(in,tmp,dir,disp);
+    G5R5(out,tmp);
+  }
+
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
 
     HermOp(in,out);
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index ee4d03d5..ba023146 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -314,9 +314,9 @@ namespace Optimization {
   template<>
     inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
     __m256 v1,v2;
-    Optimization::permute(v1,in,0); // sse 128; paired complex single
+    Optimization::permute(v1,in,0); // avx 256; quad complex single
     v1 = _mm256_add_ps(v1,in);
-    Optimization::permute(v2,v1,1); // avx 256; quad complex single
+    Optimization::permute(v2,v1,1); 
     v1 = _mm256_add_ps(v1,v2);
     u256f conv; conv.v = v1;
     return Grid::ComplexF(conv.f[0],conv.f[1]);
@@ -367,7 +367,6 @@ namespace Optimization {
     assert(0);
   }
   
-  
 }
 
 //////////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index aa1abb47..9525e45e 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -78,11 +78,18 @@ namespace Grid {
     typedef typename RealPart < Scalar_type >::type Real; 
     typedef Vector_type     vector_type;
     typedef Scalar_type     scalar_type;
+
+
+    typedef union conv_t_union {
+	Vector_type v;
+	Scalar_type s[sizeof(Vector_type)/sizeof(Scalar_type)];
+      conv_t_union(){};
+      } conv_t;
+    
    
     Vector_type v;
     
     static inline int Nsimd(void) { return sizeof(Vector_type)/sizeof(Scalar_type);}
-
     
     Grid_simd& operator=(const Grid_simd&& rhs){v=rhs.v;return *this;};
     Grid_simd& operator=(const Grid_simd& rhs){v=rhs.v;return *this;}; //faster than not declaring it and leaving to the compiler
@@ -192,6 +199,27 @@ namespace Grid {
       return *this;
     }
 
+
+    ///////////////////////////////////////
+    // Not all functions are supported
+    // through SIMD and must breakout to 
+    // scalar type and back again. This
+    // provides support
+    ///////////////////////////////////////
+
+    template<class functor> friend inline Grid_simd SimdApply (const functor &func,const Grid_simd &v) {
+
+      Grid_simd ret;
+      Grid_simd::conv_t conv;
+
+      conv.v = v.v;
+      for(int i=0;i<Nsimd();i++){
+	conv.s[i]=func(conv.s[i]);
+      }
+      ret.v = conv.v;
+      return ret;
+    }
+
     ////////////////////////////////////////////////////////////////////
     // General permute; assumes vector length is same across 
     // all subtypes; may not be a good assumption, but could
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
new file mode 100644
index 00000000..9aeec350
--- /dev/null
+++ b/lib/simd/Grid_vector_unops.h
@@ -0,0 +1,60 @@
+#ifndef GRID_VECTOR_UNOPS
+#define GRID_VECTOR_UNOPS
+
+namespace Grid { 
+
+  template<class scalar> struct SqrtRealFunctor {
+    scalar operator()(const scalar &a) const {
+      return sqrt(real(a));
+    }
+  };
+
+  template<class scalar> struct RSqrtRealFunctor {
+    scalar operator()(const scalar &a)  const {
+      return scalar(1.0/sqrt(real(a)));
+    }
+  };
+
+  template<class scalar> struct CosRealFunctor {
+    scalar operator()(const scalar &a)  const {
+      return cos(real(a));
+    }
+  };
+
+  template<class scalar> struct SinRealFunctor {
+    scalar operator()(const scalar &a)  const {
+      return sin(real(a));
+    }
+  };
+
+  template<class scalar> struct PowRealFunctor {
+    double y;
+  PowRealFunctor(double _y) : y(_y) {};
+    scalar operator()(const scalar &a)  const {
+      return pow(real(a),y);
+    }
+  };
+
+  template < class S, class V > 
+  inline Grid_simd<S,V> sqrt(const Grid_simd<S,V> &r) {
+    return SimdApply(SqrtRealFunctor<S>(),r);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> rsqrt(const Grid_simd<S,V> &r) {
+    return SimdApply(RSqrtRealFunctor<S>(),r);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> cos(const Grid_simd<S,V> &r) {
+    return SimdApply(CosRealFunctor<S>(),r);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> sin(const Grid_simd<S,V> &r) {
+    return SimdApply(CosRealFunctor<S>(),r);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> pow(const Grid_simd<S,V> &r,double y) {
+    return SimdApply(PowRealFunctor<S>(y),r);
+  }
+
+}
+#endif
diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h
index bd1902b8..3ed4f865 100644
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -123,6 +123,13 @@ public:
   typedef iScalar<tensor_reduced_v> tensor_reduced;
   typedef iVector<recurse_scalar_object,N> scalar_object;
 
+  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iVector<vtype,N>
+    { 
+      zeroit(*this);
+      for(int i=0;i<N;i++)
+	_internal[i] = arg;
+      return *this;
+    }
 
   enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
   iVector(const Zero &z){ *this = zero; };
diff --git a/lib/tensors/Tensor_unary.h b/lib/tensors/Tensor_unary.h
new file mode 100644
index 00000000..1fbec985
--- /dev/null
+++ b/lib/tensors/Tensor_unary.h
@@ -0,0 +1,37 @@
+#ifndef GRID_TENSOR_UNARY_H
+#define GRID_TENSOR_UNARY_H
+namespace Grid {
+
+#define UNARY_REAL(func)\
+template<class obj> inline auto func(const iScalar<obj> &z) -> iScalar<obj>\
+{\
+    iScalar<obj> ret;\
+    ret._internal = func( (z._internal));\
+    return ret;\
+}\
+template<class obj,int N> inline auto func(const iVector<obj,N> &z) -> iVector<obj,N>\
+{\
+    iVector<obj,N> ret;\
+    for(int c1=0;c1<N;c1++){\
+      ret._internal[c1] = func( (z._internal[c1]));\
+    }\
+    return ret;\
+}\
+template<class obj,int N> inline auto func(const iMatrix<obj,N> &z) -> iMatrix<obj,N>\
+{\
+    iMatrix<obj,N> ret;\
+    for(int c1=0;c1<N;c1++){\
+    for(int c2=0;c2<N;c2++){\
+      ret._internal[c1][c2] = func( (z._internal[c1][c2]));\
+    }}\
+    return ret;\
+}
+
+UNARY_REAL(sqrt);
+UNARY_REAL(rsqrt);
+UNARY_REAL(sin);
+UNARY_REAL(cos);
+
+
+}
+#endif
diff --git a/tests/Make.inc b/tests/Make.inc
index de9d0709..d417ab56 100644
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_even_odd Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_even_odd Test_gamma Test_main Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_even_odd
+bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_main Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
 
 
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
@@ -10,10 +10,18 @@ Test_cayley_cg_SOURCES=Test_cayley_cg.cc
 Test_cayley_cg_LDADD=-lGrid
 
 
+Test_cayley_coarsen_support_SOURCES=Test_cayley_coarsen_support.cc
+Test_cayley_coarsen_support_LDADD=-lGrid
+
+
 Test_cayley_even_odd_SOURCES=Test_cayley_even_odd.cc
 Test_cayley_even_odd_LDADD=-lGrid
 
 
+Test_cf_coarsen_support_SOURCES=Test_cf_coarsen_support.cc
+Test_cf_coarsen_support_LDADD=-lGrid
+
+
 Test_cf_cr_unprec_SOURCES=Test_cf_cr_unprec.cc
 Test_cf_cr_unprec_LDADD=-lGrid
 
diff --git a/tests/Test_GaugeAction.cc b/tests/Test_GaugeAction.cc
index 0dde6d8f..bbfd8717 100644
--- a/tests/Test_GaugeAction.cc
+++ b/tests/Test_GaugeAction.cc
@@ -115,7 +115,7 @@ int main (int argc, char ** argv)
   Complex l  = TensorRemove(Tl);
   std::cout << "calculated link trace " <<l*LinkTraceScale<<std::endl;
 
-  sumBlocks(cPlaq,Plaq);
+  blockSum(cPlaq,Plaq);
   TComplex TcP = sum(cPlaq);
   Complex ll= TensorRemove(TcP);
   std::cout << "coarsened plaquettes sum to " <<ll*PlaqScale<<std::endl;
diff --git a/tests/Test_cayley_coarsen_support.cc b/tests/Test_cayley_coarsen_support.cc
new file mode 100644
index 00000000..acd96da3
--- /dev/null
+++ b/tests/Test_cayley_coarsen_support.cc
@@ -0,0 +1,108 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+
+  const int Ls=8;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid
+  std::vector<int> clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeFermion    src(FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid); ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass=0.5;
+  RealD M5=1.8;
+
+  DomainWallFermion Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  Gamma5HermitianLinearOperator<DomainWallFermion,LatticeFermion> HermIndefOp(Ddwf);
+
+  HermIndefOp.Op(src,ref);
+  HermIndefOp.OpDiag(src,result);
+  
+  for(int d=0;d<4;d++){
+    HermIndefOp.OpDir(src,tmp,d,+1); result=result+tmp; 
+    std::cout<<"dir "<<d<<" tmp "<<norm2(tmp)<<std::endl;
+    HermIndefOp.OpDir(src,tmp,d,-1); result=result+tmp;
+    std::cout<<"dir "<<d<<" tmp "<<norm2(tmp)<<std::endl;
+  }
+  err = result-ref;
+  std::cout<<"Error "<<norm2(err)<<std::endl;
+
+  const int nbasis = 8;
+  std::vector<LatticeFermion> subspace(nbasis,FGrid);
+  
+  for(int b=0;b<nbasis;b++){
+    random(RNG5,subspace[b]);
+  }
+  std::cout << "Computed randoms"<< std::endl;
+
+  CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOp(*Coarse5d);
+
+  LittleDiracOp.CoarsenOperator(FGrid,HermIndefOp,subspace);
+
+  typedef   Lattice<iVector<vComplex,nbasis > > coarse_vec;
+  
+  coarse_vec c_src (Coarse5d);  c_src= zero;
+  coarse_vec c_res (Coarse5d);
+  
+  Complex one(1.0);
+  c_src = one;  // 1 in every element for vector 1.
+
+  //  TODO
+  // -- promote from subspace, check we get the vector we wanted
+  // -- apply ldop; check we get the same as inner product of M times big vec
+  // -- pick blocks one by one. Evaluate matrix elements.
+
+  std::cout << "Multiplying by LittleDiracOp "<< std::endl;
+  LittleDiracOp.M(c_src,c_res);
+
+  std::cout << "Done "<< std::endl;
+  Grid_finalize();
+}
diff --git a/tests/Test_cf_coarsen_support.cc b/tests/Test_cf_coarsen_support.cc
new file mode 100644
index 00000000..9d81a798
--- /dev/null
+++ b/tests/Test_cf_coarsen_support.cc
@@ -0,0 +1,87 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=9;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeFermion    src(FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid); ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass=0.1;
+  RealD M5=1.8;
+
+  {
+    OverlapWilsonContFracTanhFermion Dcf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0);
+    HermitianLinearOperator<OverlapWilsonContFracTanhFermion,LatticeFermion> HermIndefOp(Dcf);
+
+    HermIndefOp.Op(src,ref);
+    HermIndefOp.OpDiag(src,result);
+    
+    for(int d=0;d<4;d++){
+      HermIndefOp.OpDir(src,tmp,d,+1); result=result+tmp; 
+      std::cout<<"dir "<<d<<" tmp "<<norm2(tmp)<<std::endl;
+      HermIndefOp.OpDir(src,tmp,d,-1); result=result+tmp;
+      std::cout<<"dir "<<d<<" tmp "<<norm2(tmp)<<std::endl;
+    }
+    err = result-ref;
+    std::cout<<"Error "<<norm2(err)<<std::endl;
+  }
+
+  {
+    OverlapWilsonPartialFractionTanhFermion Dpf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0);
+    HermitianLinearOperator<OverlapWilsonPartialFractionTanhFermion,LatticeFermion> HermIndefOp(Dpf);
+    
+    HermIndefOp.Op(src,ref);
+    HermIndefOp.OpDiag(src,result);
+    
+    for(int d=0;d<4;d++){
+      HermIndefOp.OpDir(src,tmp,d,+1); result=result+tmp; 
+      std::cout<<"dir "<<d<<" tmp "<<norm2(tmp)<<std::endl;
+      HermIndefOp.OpDir(src,tmp,d,-1); result=result+tmp;
+      std::cout<<"dir "<<d<<" tmp "<<norm2(tmp)<<std::endl;
+    }
+
+    err = result-ref;
+    std::cout<<"Error "<<norm2(err)<<std::endl;
+  }
+
+
+  Grid_finalize();
+}
diff --git a/tests/Test_cf_cr_unprec.cc b/tests/Test_cf_cr_unprec.cc
new file mode 100644
index 00000000..9c1aea60
--- /dev/null
+++ b/tests/Test_cf_cr_unprec.cc
@@ -0,0 +1,58 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=9;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeFermion    src(FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass=0.1;
+  RealD M5=1.8;
+
+  OverlapWilsonContFracTanhFermion Dcf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0);
+
+  ConjugateResidual<LatticeFermion> MCR(1.0e-8,10000);
+
+  MdagMLinearOperator<OverlapWilsonContFracTanhFermion,LatticeFermion> HermPosDefOp(Dcf);
+  MCR(HermPosDefOp,src,result);
+
+  HermitianLinearOperator<OverlapWilsonContFracTanhFermion,LatticeFermion> HermIndefOp(Dcf);
+  MCR(HermIndefOp,src,result);
+
+  Grid_finalize();
+}
diff --git a/tests/Test_dwf_cr_unprec.cc b/tests/Test_dwf_cr_unprec.cc
new file mode 100644
index 00000000..cffa726c
--- /dev/null
+++ b/tests/Test_dwf_cr_unprec.cc
@@ -0,0 +1,63 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+
+  const int Ls=8;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  LatticeFermion    src(FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+
+  ConjugateResidual<LatticeFermion> MCR(1.0e-8,10000);
+  
+  RealD mass=0.5;
+  RealD M5=1.8;
+  DomainWallFermion Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  MdagMLinearOperator<DomainWallFermion,LatticeFermion> HermOp(Ddwf);
+  MCR(HermOp,src,result);
+
+  Gamma5HermitianLinearOperator<DomainWallFermion,LatticeFermion> g5HermOp(Ddwf);
+  MCR(g5HermOp,src,result);
+
+
+  Grid_finalize();
+}
diff --git a/tests/Test_nersc_io.cc b/tests/Test_nersc_io.cc
index 80d78291..403d2cab 100644
--- a/tests/Test_nersc_io.cc
+++ b/tests/Test_nersc_io.cc
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
   Complex l  = TensorRemove(Tl);
   std::cout << "calculated link trace " <<l*LinkTraceScale<<std::endl;
 
-  sumBlocks(cPlaq,Plaq);
+  blockSum(cPlaq,Plaq);
   TComplex TcP = sum(cPlaq);
   Complex ll= TensorRemove(TcP);
   std::cout << "coarsened plaquettes sum to " <<ll*PlaqScale<<std::endl;
diff --git a/tests/Test_wilson_cr_unprec.cc b/tests/Test_wilson_cr_unprec.cc
new file mode 100644
index 00000000..7fbf8c8f
--- /dev/null
+++ b/tests/Test_wilson_cr_unprec.cc
@@ -0,0 +1,59 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeFermion src(&Grid); random(pRNG,src);
+  RealD nrm = norm2(src);
+  LatticeFermion result(&Grid); result=zero;
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,&Grid);
+
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass=0.5;
+  WilsonFermion Dw(Umu,Grid,RBGrid,mass);
+
+  MdagMLinearOperator<WilsonFermion,LatticeFermion> HermOp(Dw);
+
+  ConjugateResidual<LatticeFermion> MCR(1.0e-8,10000);
+
+  MCR(HermOp,src,result);
+
+  Grid_finalize();
+}

From a6ac2abb649148a13cd6f4f294663445e0bca703 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 8 Jun 2015 12:12:13 +0100
Subject: [PATCH 07/24] No compile fix after merge

---
 tests/Test_multishift_sqrt.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/Test_multishift_sqrt.cc b/tests/Test_multishift_sqrt.cc
index 8e6b08d4..1ecc1fde 100644
--- a/tests/Test_multishift_sqrt.cc
+++ b/tests/Test_multishift_sqrt.cc
@@ -20,12 +20,20 @@ public:
     sqrtscale = sqrtscale * adj(sqrtscale);// force real pos def
     scale = sqrtscale * sqrtscale; 
   }
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {};
+  void OpDir  (const Field &in, Field &out,int dir,int disp){};
+
   void Op     (const Field &in, Field &out){
     out = scale * in;
   }
   void AdjOp  (const Field &in, Field &out){
     out = scale * in;
   }
+  void HermOp(const Field &in, Field &out){
+    double n1, n2;
+    HermOpAndNorm(in,out,n1,n2);
+  }
   void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
     ComplexD dot;
 

From 744ac33e8b9d479069ac2f822fc57b521fd49548 Mon Sep 17 00:00:00 2001
From: neo <cossu@post.kek.jp>
Date: Tue, 9 Jun 2015 15:46:21 +0900
Subject: [PATCH 08/24] Experimental support for ARM

---
 .gitignore                         |   2 +-
 INSTALL                            |   2 +-
 configure                          |  29 ++-
 configure.ac                       |  22 ++-
 lib/GridConfig.h                   |  14 +-
 lib/GridConfig.h.in                |  14 +-
 lib/Make.inc                       |   4 +-
 lib/algorithms/approx/.dirstamp    |   0
 lib/communicator/.dirstamp         |   0
 lib/qcd/spin/.dirstamp             |   0
 lib/qcd/utils/.dirstamp            |   0
 lib/simd/Grid_avx.h                |   8 +-
 lib/simd/Grid_avx512.h             |   8 +-
 lib/simd/Grid_empty.h              | 289 +++++++++++++++++++++++++++
 lib/simd/Grid_neon.h               | 308 +++++++++++++++++++++++++++++
 lib/simd/Grid_sse4.h               |   9 +-
 lib/simd/Grid_vector_types.h       |  10 +-
 lib/stencil/.dirstamp              |   0
 scripts/arm_configure.experimental |   1 +
 tests/Make.inc                     |  10 +-
 20 files changed, 693 insertions(+), 37 deletions(-)
 create mode 100644 lib/algorithms/approx/.dirstamp
 create mode 100644 lib/communicator/.dirstamp
 create mode 100644 lib/qcd/spin/.dirstamp
 create mode 100644 lib/qcd/utils/.dirstamp
 create mode 100644 lib/simd/Grid_empty.h
 create mode 100644 lib/simd/Grid_neon.h
 create mode 100644 lib/stencil/.dirstamp
 create mode 100644 scripts/arm_configure.experimental

diff --git a/.gitignore b/.gitignore
index 82e09bc0..be00eb30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,7 +49,7 @@ config.status
 /stamp-h1
 /config.sub
 /config.guess
-
+/INSTALL
 
 # Packages #
 ############
diff --git a/INSTALL b/INSTALL
index 80a61507..f812f5a2 120000
--- a/INSTALL
+++ b/INSTALL
@@ -1 +1 @@
-/opt/local/share/automake-1.15/INSTALL
\ No newline at end of file
+/usr/share/automake-1.14/INSTALL
\ No newline at end of file
diff --git a/configure b/configure
index 36f85edb..702a52db 100755
--- a/configure
+++ b/configure
@@ -2574,7 +2574,7 @@ test -n "$target_alias" &&
     NONENONEs,x,x, &&
   program_prefix=${target_alias}-
 
-am__api_version='1.15'
+am__api_version='1.14'
 
 # Find a good install program.  We prefer a C program (faster),
 # so one script is as good as another.  But avoid the broken or
@@ -2746,8 +2746,8 @@ test "$program_suffix" != NONE &&
 ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
 program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
 
-# Expand $ac_aux_dir to an absolute path.
-am_aux_dir=`cd "$ac_aux_dir" && pwd`
+# expand $ac_aux_dir to an absolute path
+am_aux_dir=`cd $ac_aux_dir && pwd`
 
 if test x"${MISSING+set}" != xset; then
   case $am_aux_dir in
@@ -2766,7 +2766,7 @@ else
 $as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
 fi
 
-if test x"${install_sh+set}" != xset; then
+if test x"${install_sh}" != xset; then
   case $am_aux_dir in
   *\ * | *\	*)
     install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -3094,8 +3094,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 mkdir_p='$(MKDIR_P)'
 
-# We need awk for the "check" target (and possibly the TAP driver).  The
-# system "awk" is bad on some platforms.
+# We need awk for the "check" target.  The system "awk" is bad on
+# some platforms.
 # Always define AMTAR for backward compatibility.  Yes, it's still used
 # in the wild :-(  We should find a proper way to deprecate it ...
 AMTAR='$${TAR-tar}'
@@ -3154,7 +3154,6 @@ END
 fi
 
 
-
 ac_config_headers="$ac_config_headers lib/GridConfig.h"
 
 # Check whether --enable-silent-rules was given.
@@ -6003,6 +6002,7 @@ $as_echo "$as_me: WARNING: Your processor supports fma instructions but not your
 
 
 
+
 # Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)
 
@@ -6709,8 +6709,21 @@ $as_echo "#define AVX512 1" >>confdefs.h
 
        supported="cross compilation"
      ;;
+     NEONv7)
+       echo Configuring for experimental ARMv7 support
+
+$as_echo "#define NEONv7 1" >>confdefs.h
+
+       supported="cross compilation"
+     ;;
+     DEBUG)
+       echo Configuring without SIMD support - only for compiler DEBUGGING!
+
+$as_echo "#define EMPTY_SIMD 1" >>confdefs.h
+
+      ;;
      *)
-     as_fn_error $? "${ac_SIMD} unsupported --enable-simd option" "$LINENO" 5;
+     as_fn_error $? "${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options" "$LINENO" 5;
      ;;
 esac
 
diff --git a/configure.ac b/configure.ac
index 59792fb0..cf0da9eb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-05-27 18:51:47 neo>
+# Time-stamp: <2015-06-09 15:26:39 neo>
 
 AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@@ -29,6 +29,7 @@ AC_PROG_RANLIB
 AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
 AX_EXT
 
+
 # Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)
 
@@ -75,7 +76,7 @@ supported=no
 case ${ac_SIMD} in
      SSE4)
        echo Configuring for SSE4
-       AC_DEFINE([SSE4],[1],[SSE4] )
+       AC_DEFINE([SSE4],[1],[SSE4 Intrinsics] )
        if test x"$ax_cv_support_ssse3_ext" = x"yes"; then  dnl minimal support for SSE4
          supported=yes
        else
@@ -84,7 +85,7 @@ case ${ac_SIMD} in
      ;;
      AVX)
        echo Configuring for AVX
-       AC_DEFINE([AVX1],[1],[AVX] )
+       AC_DEFINE([AVX1],[1],[AVX Intrinsics] )
        if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
        supported=yes			  
        else
@@ -93,7 +94,7 @@ case ${ac_SIMD} in
      ;;
      AVX2)
        echo Configuring for AVX2
-       AC_DEFINE([AVX2],[1],[AVX2] )
+       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
        if test x"$ax_cv_support_avx2_ext" = x"yes"; then  dnl minimal support for AVX2
        supported=yes
        else
@@ -102,11 +103,20 @@ case ${ac_SIMD} in
      ;;
      AVX512|MIC)
        echo Configuring for AVX512 and MIC
-       AC_DEFINE([AVX512],[1],[AVX512] )
+       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
        supported="cross compilation"
      ;;
+     NEONv7)
+       echo Configuring for experimental ARMv7 support 
+       AC_DEFINE([NEONv7],[1],[NEON ARMv7 Experimental support ] )
+       supported="cross compilation"
+     ;;
+     DEBUG)
+       echo Configuring without SIMD support - only for compiler DEBUGGING!
+       AC_DEFINE([EMPTY_SIMD],[1],[EMPTY_SIMD only for DEBUGGING] )
+      ;;     
      *)
-     AC_MSG_ERROR([${ac_SIMD} unsupported --enable-simd option]); 
+     AC_MSG_ERROR([${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options]); 
      ;;
 esac
 
diff --git a/lib/GridConfig.h b/lib/GridConfig.h
index e5b75382..c9dea1cb 100644
--- a/lib/GridConfig.h
+++ b/lib/GridConfig.h
@@ -1,15 +1,18 @@
 /* lib/GridConfig.h.  Generated from GridConfig.h.in by configure.  */
 /* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */
 
-/* AVX */
+/* AVX Intrinsics */
 /* #undef AVX1 */
 
-/* AVX2 */
+/* AVX2 Intrinsics */
 /* #undef AVX2 */
 
-/* AVX512 */
+/* AVX512 Intrinsics for Knights Corner */
 /* #undef AVX512 */
 
+/* EMPTY_SIMD only for DEBUGGING */
+/* #undef EMPTY_SIMD */
+
 /* GRID_COMMS_MPI */
 /* #undef GRID_COMMS_MPI */
 
@@ -111,6 +114,9 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #define HAVE_UNISTD_H 1
 
+/* NEON ARMv7 Experimental support */
+/* #undef NEONv7 */
+
 /* Name of package */
 #define PACKAGE "grid"
 
@@ -132,7 +138,7 @@
 /* Define to the version of this package. */
 #define PACKAGE_VERSION "1.0"
 
-/* SSE4 */
+/* SSE4 Intrinsics */
 #define SSE4 1
 
 /* Define to 1 if you have the ANSI C header files. */
diff --git a/lib/GridConfig.h.in b/lib/GridConfig.h.in
index afa1d1e4..8ff77f3d 100644
--- a/lib/GridConfig.h.in
+++ b/lib/GridConfig.h.in
@@ -1,14 +1,17 @@
 /* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */
 
-/* AVX */
+/* AVX Intrinsics */
 #undef AVX1
 
-/* AVX2 */
+/* AVX2 Intrinsics */
 #undef AVX2
 
-/* AVX512 */
+/* AVX512 Intrinsics for Knights Corner */
 #undef AVX512
 
+/* EMPTY_SIMD only for DEBUGGING */
+#undef EMPTY_SIMD
+
 /* GRID_COMMS_MPI */
 #undef GRID_COMMS_MPI
 
@@ -110,6 +113,9 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H
 
+/* NEON ARMv7 Experimental support */
+#undef NEONv7
+
 /* Name of package */
 #undef PACKAGE
 
@@ -131,7 +137,7 @@
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 
-/* SSE4 */
+/* SSE4 Intrinsics */
 #undef SSE4
 
 /* Define to 1 if you have the ANSI C header files. */
diff --git a/lib/Make.inc b/lib/Make.inc
index c38606e2..edbcff7f 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h
+HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_vector_unops.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_unary.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_poke.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_unary.h ./tensors/Tensor_peek.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/LinalgUtils.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/spin/TwoSpinor.h ./qcd/spin/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Threads.h ./Comparison.h ./Grid.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/CoarsenedMatrix.h ./stencil/Lebesgue.h
 
-CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./qcd/utils/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/spin/Dirac.cc ./GridInit.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
diff --git a/lib/algorithms/approx/.dirstamp b/lib/algorithms/approx/.dirstamp
new file mode 100644
index 00000000..e69de29b
diff --git a/lib/communicator/.dirstamp b/lib/communicator/.dirstamp
new file mode 100644
index 00000000..e69de29b
diff --git a/lib/qcd/spin/.dirstamp b/lib/qcd/spin/.dirstamp
new file mode 100644
index 00000000..e69de29b
diff --git a/lib/qcd/utils/.dirstamp b/lib/qcd/utils/.dirstamp
new file mode 100644
index 00000000..e69de29b
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index ba023146..2dc38cca 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-05-29 14:13:30 neo>
+// Time-stamp: <2015-06-09 14:26:59 neo>
 //----------------------------------------------------------------------
 
 #include <immintrin.h>
@@ -383,6 +383,12 @@ namespace Grid {
       _mm_prefetch(ptr+i+512,_MM_HINT_T0);
     }
   }
+  inline void prefetch_HINT_T0(const char *ptr){
+    _mm_prefetch(ptr,_MM_HINT_T0);
+  }
+
+
+
   template < typename VectorSIMD > 
     inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
     Optimization::permute(y.v,b.v,perm);
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index bb914270..10741174 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-05-27 12:08:50 neo>
+// Time-stamp: <2015-06-09 14:27:28 neo>
 //----------------------------------------------------------------------
 
 #include <immintrin.h>
@@ -309,6 +309,12 @@ namespace Grid {
       _mm_prefetch(ptr+i+512,_MM_HINT_T0);
     }
   }
+  inline void prefetch_HINT_T0(const char *ptr){
+    _mm_prefetch(ptr,_MM_HINT_T0);
+  }
+
+
+
 
   // Gpermute utilities consider coalescing into 1 Gpermute
   template < typename VectorSIMD > 
diff --git a/lib/simd/Grid_empty.h b/lib/simd/Grid_empty.h
new file mode 100644
index 00000000..1c088d87
--- /dev/null
+++ b/lib/simd/Grid_empty.h
@@ -0,0 +1,289 @@
+//----------------------------------------------------------------------
+/*! @file Grid_sse4.h
+  @brief Empty Optimization libraries for debugging
+
+  Using intrinsics
+*/
+// Time-stamp: <2015-06-09 14:28:02 neo>
+//----------------------------------------------------------------------
+
+namespace Optimization {
+
+  template<class vtype>
+  union uconv {
+    float f;
+    vtype v;
+  };
+
+  union u128f {
+    float v;
+    float f[4];
+  };
+  union u128d {
+    double v;
+    double f[2];
+  };
+  
+  struct Vsplat{
+    //Complex float
+    inline float operator()(float a, float b){
+      return 0;
+    }
+    // Real float
+    inline float operator()(float a){
+      return 0;
+    }
+    //Complex double
+    inline double operator()(double a, double b){
+      return 0;
+    }
+    //Real double
+    inline double operator()(double a){
+      return 0;
+    }
+    //Integer
+    inline int operator()(Integer a){
+      return 0;
+    }
+  };
+
+  struct Vstore{
+    //Float 
+    inline void operator()(float a, float* F){
+      
+    }
+    //Double
+    inline void operator()(double a, double* D){
+     
+    }
+    //Integer
+    inline void operator()(int a, Integer* I){
+      
+    }
+
+  };
+
+  struct Vstream{
+    //Float
+    inline void operator()(float * a, float b){
+     
+    }
+    //Double
+    inline void operator()(double * a, double b){
+     
+    }
+
+
+  };
+
+  struct Vset{
+    // Complex float 
+    inline float operator()(Grid::ComplexF *a){
+      return 0;
+    }
+    // Complex double 
+    inline double operator()(Grid::ComplexD *a){
+      return 0;
+    }
+    // Real float 
+    inline float operator()(float *a){
+      return  0;
+    }
+    // Real double
+    inline double operator()(double *a){
+      return 0;
+    }
+    // Integer
+    inline int operator()(Integer *a){
+      return 0;
+    }
+
+
+  };
+
+  template <typename Out_type, typename In_type>
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    //Complex/Real float
+    inline float operator()(float a, float b){
+      return 0;
+    }
+    //Complex/Real double
+    inline double operator()(double a, double b){
+      return 0;
+    }
+    //Integer
+    inline int operator()(int a, int b){
+      return 0;
+    }
+  };
+
+  struct Sub{
+    //Complex/Real float
+    inline float operator()(float a, float b){
+      return 0;
+    }
+    //Complex/Real double
+    inline double operator()(double a, double b){
+      return 0;
+    }
+    //Integer
+    inline int operator()(int a, int b){
+      return 0;
+    }
+  };
+
+  struct MultComplex{
+    // Complex float
+    inline float operator()(float a, float b){
+      return 0;
+    }
+    // Complex double
+    inline double operator()(double a, double b){
+      return 0;
+    }
+  };
+
+  struct Mult{
+    // Real float
+    inline float operator()(float a, float b){
+      return 0;
+    }
+    // Real double
+    inline double operator()(double a, double b){
+      return 0;
+    }
+    // Integer
+    inline int operator()(int a, int b){
+      return 0;
+    }
+  };
+
+  struct Conj{
+    // Complex single
+    inline float operator()(float in){
+      return 0;
+    }
+    // Complex double
+    inline double operator()(double in){
+      return 0;
+    }
+    // do not define for integer input
+  };
+
+  struct TimesMinusI{
+    //Complex single
+    inline float operator()(float in, float ret){
+      return 0;
+    }
+    //Complex double
+    inline double operator()(double in, double ret){
+      return 0;
+    }
+
+
+  };
+
+  struct TimesI{
+    //Complex single
+    inline float operator()(float in, float ret){
+      return 0;
+    }
+    //Complex double
+    inline double operator()(double in, double ret){
+      return 0;
+    }
+  };
+
+  //////////////////////////////////////////////
+  // Some Template specialization
+  template < typename vtype > 
+    void permute(vtype &a, vtype b, int perm) {
+   }; 
+
+  //Complex float Reduce
+  template<>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
+    return 0;
+  }
+  //Real float Reduce
+  template<>
+  inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
+    return 0;
+  }
+  
+  
+  //Complex double Reduce
+  template<>
+  inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
+    return 0;
+  }
+  
+  //Real double Reduce
+  template<>
+  inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
+    return 0;
+  }
+
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, int>::operator()(int in){
+    // FIXME unimplemented
+   printf("Reduce : Missing integer implementation -> FIX\n");
+    assert(0);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Here assign types 
+namespace Grid {
+
+  typedef float SIMD_Ftype;  // Single precision type
+  typedef double SIMD_Dtype; // Double precision type
+  typedef int SIMD_Itype; // Integer type
+
+  // prefetch utilities
+  inline void v_prefetch0(int size, const char *ptr){};
+  inline void prefetch_HINT_T0(const char *ptr){};
+
+
+
+  // Gpermute function
+  template < typename VectorSIMD > 
+    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
+    Optimization::permute(y.v,b.v,perm);
+  }
+
+
+  // Function name aliases
+  typedef Optimization::Vsplat   VsplatSIMD;
+  typedef Optimization::Vstore   VstoreSIMD;
+  typedef Optimization::Vset     VsetSIMD;
+  typedef Optimization::Vstream  VstreamSIMD;
+  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+
+ 
+
+
+  // Arithmetic operations
+  typedef Optimization::Sum         SumSIMD;
+  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::Conj        ConjSIMD;
+  typedef Optimization::TimesMinusI TimesMinusISIMD;
+  typedef Optimization::TimesI      TimesISIMD;
+
+}
diff --git a/lib/simd/Grid_neon.h b/lib/simd/Grid_neon.h
new file mode 100644
index 00000000..73660b40
--- /dev/null
+++ b/lib/simd/Grid_neon.h
@@ -0,0 +1,308 @@
+//----------------------------------------------------------------------
+/*! @file Grid_sse4.h
+  @brief Optimization libraries for NEON (ARM) instructions set ARMv7
+
+  Experimental - Using intrinsics - DEVELOPING! 
+*/
+// Time-stamp: <2015-06-09 15:25:40 neo>
+//----------------------------------------------------------------------
+
+#include <arm_neon.h>
+
+namespace Optimization {
+
+  template<class vtype>
+  union uconv {
+    float32x4_t f;
+    vtype v;
+  };
+
+  union u128f {
+    float32x4_t v;
+    float f[4];
+  };
+  union u128d {
+    float32x4_t v;
+    float f[4];
+  };
+  
+  struct Vsplat{
+    //Complex float
+    inline float32x4_t operator()(float a, float b){
+      float32x4_t foo;
+      return foo;
+    }
+    // Real float
+    inline float32x4_t operator()(float a){
+      float32x4_t foo;
+      return foo;
+    }
+    //Complex double
+    inline float32x4_t operator()(double a, double b){
+      float32x4_t foo;
+      return foo;
+    }
+    //Real double
+    inline float32x4_t operator()(double a){
+      float32x4_t foo;
+      return foo;
+    }
+    //Integer
+    inline uint32x4_t operator()(Integer a){
+      uint32x4_t foo;
+      return foo;
+    }
+  };
+
+  struct Vstore{
+    //Float 
+    inline void operator()(float32x4_t a, float* F){
+      
+    }
+    //Double
+    inline void operator()(float32x4_t a, double* D){
+      
+    }
+    //Integer
+    inline void operator()(uint32x4_t a, Integer* I){
+     
+    }
+
+  };
+
+  struct Vstream{
+    //Float
+    inline void operator()(float * a, float32x4_t b){
+    
+    }
+    //Double
+    inline void operator()(double * a, float32x4_t b){
+  
+    }
+
+
+  };
+
+  struct Vset{
+    // Complex float 
+    inline float32x4_t operator()(Grid::ComplexF *a){
+      float32x4_t foo;
+      return foo;
+    }
+    // Complex double 
+    inline float32x4_t operator()(Grid::ComplexD *a){
+      float32x4_t foo;
+      return foo;
+    }
+    // Real float 
+    inline float32x4_t operator()(float *a){
+      float32x4_t foo;
+      return foo;
+    }
+    // Real double
+    inline float32x4_t operator()(double *a){
+      float32x4_t foo;
+      return foo;
+    }
+    // Integer
+    inline uint32x4_t operator()(Integer *a){
+      uint32x4_t foo;
+      return foo;
+    }
+
+
+  };
+
+  template <typename Out_type, typename In_type>
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    //Complex/Real float
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      float32x4_t foo;
+      return foo;
+    }
+    //Complex/Real double
+    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    //  float32x4_t foo;
+    //  return foo;
+    //}
+    //Integer
+    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
+      uint32x4_t foo;
+      return foo;
+    }
+  };
+
+  struct Sub{
+    //Complex/Real float
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      float32x4_t foo;
+      return foo;
+    }
+    //Complex/Real double
+    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    //  float32x4_t foo;
+    //  return foo;
+    //}
+    //Integer
+    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
+      uint32x4_t foo;
+      return foo;
+    }
+  };
+
+  struct MultComplex{
+    // Complex float
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      float32x4_t foo;
+      return foo;
+    }
+    // Complex double
+    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    //  float32x4_t foo;
+    //  return foo;
+    //}
+  };
+
+  struct Mult{
+    // Real float
+    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+      return a;
+    }
+    // Real double
+    //inline float32x4_t operator()(float32x4_t a, float32x4_t b){
+    //  return 0;
+    //}
+    // Integer
+    inline uint32x4_t operator()(uint32x4_t a, uint32x4_t b){
+      return a;
+    }
+  };
+
+  struct Conj{
+    // Complex single
+    inline float32x4_t operator()(float32x4_t in){
+      return in;
+    }
+    // Complex double
+    //inline float32x4_t operator()(float32x4_t in){
+    // return 0;
+    //}
+    // do not define for integer input
+  };
+
+  struct TimesMinusI{
+    //Complex single
+    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+      return in;
+    }
+    //Complex double
+    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+    //  return in;
+    //}
+
+
+  };
+
+  struct TimesI{
+    //Complex single
+    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+      return in;
+    }
+    //Complex double
+    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
+    //  return 0;
+    //}
+  };
+
+  //////////////////////////////////////////////
+  // Some Template specialization
+  template < typename vtype > 
+    void permute(vtype &a, vtype b, int perm) {
+
+  }; 
+
+  //Complex float Reduce
+  template<>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
+    return 0;
+  }
+  //Real float Reduce
+  template<>
+  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
+    return 0;
+  }
+  
+  
+  //Complex double Reduce
+  template<>
+  inline Grid::ComplexD Reduce<Grid::ComplexD, float32x4_t>::operator()(float32x4_t in){
+    return 0;
+  }
+  
+  //Real double Reduce
+  template<>
+  inline Grid::RealD Reduce<Grid::RealD, float32x4_t>::operator()(float32x4_t in){
+    return 0;
+  }
+
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
+    // FIXME unimplemented
+   printf("Reduce : Missing integer implementation -> FIX\n");
+    assert(0);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Here assign types 
+namespace Grid {
+
+  typedef float32x4_t  SIMD_Ftype; // Single precision type
+  typedef float32x4_t  SIMD_Dtype; // Double precision type - no double on ARMv7
+  typedef uint32x4_t   SIMD_Itype; // Integer type
+
+  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
+  inline void prefetch_HINT_T0(const char *ptr){};
+
+
+  // Gpermute function
+  template < typename VectorSIMD > 
+    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
+    Optimization::permute(y.v,b.v,perm);
+  }
+
+
+  // Function name aliases
+  typedef Optimization::Vsplat   VsplatSIMD;
+  typedef Optimization::Vstore   VstoreSIMD;
+  typedef Optimization::Vset     VsetSIMD;
+  typedef Optimization::Vstream  VstreamSIMD;
+  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+
+ 
+
+
+  // Arithmetic operations
+  typedef Optimization::Sum         SumSIMD;
+  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::Conj        ConjSIMD;
+  typedef Optimization::TimesMinusI TimesMinusISIMD;
+  typedef Optimization::TimesI      TimesISIMD;
+
+}
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 0da888f7..3a54c0b3 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-05-27 12:02:07 neo>
+// Time-stamp: <2015-06-09 14:24:01 neo>
 //----------------------------------------------------------------------
 
 #include <pmmintrin.h>
@@ -297,7 +297,12 @@ namespace Grid {
   typedef __m128d SIMD_Dtype; // Double precision type
   typedef __m128i SIMD_Itype; // Integer type
 
-  inline void v_prefetch0(int size, const char *ptr){};  // prefetch utilities
+  // prefetch utilities
+  inline void v_prefetch0(int size, const char *ptr){};
+  inline void prefetch_HINT_T0(const char *ptr){
+    _mm_prefetch(ptr,_MM_HINT_T0);
+  }
+  
 
   // Gpermute function
   template < typename VectorSIMD > 
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 9525e45e..c670f531 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,11 +2,14 @@
 /*! @file Grid_vector_types.h
   @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-05-29 14:19:48 neo>
+// Time-stamp: <2015-06-09 15:00:47 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
 
+#ifdef EMPTY_SIMD
+#include "Grid_empty.h"
+#endif
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
@@ -19,6 +22,9 @@
 #if defined QPX
 #include "Grid_qpx.h"
 #endif
+#ifdef NEONv7
+#include "Grid_neon.h"
+#endif
 
 namespace Grid {
 
@@ -152,7 +158,7 @@ namespace Grid {
     ///////////////////////
     friend inline void vprefetch(const Grid_simd &v)
     {
-      _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
+      prefetch_HINT_T0((const char*)&v.v);
     }
 
     ///////////////////////
diff --git a/lib/stencil/.dirstamp b/lib/stencil/.dirstamp
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/arm_configure.experimental b/scripts/arm_configure.experimental
new file mode 100644
index 00000000..fbad84a5
--- /dev/null
+++ b/scripts/arm_configure.experimental
@@ -0,0 +1 @@
+./configure --host=arm-linux-gnueabihf  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/arm-linux-gnueabihf/include/c++/4.8.2/arm-linux-gnueabihf/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a7' --enable-simd=NEONv7
diff --git a/tests/Make.inc b/tests/Make.inc
index 2b132fd0..1f5573ca 100644
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,9 +1,5 @@
 
-bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
-
-
-Test_GaugeAction_SOURCES=Test_GaugeAction.cc
-Test_GaugeAction_LDADD=-lGrid
+bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_GaugeAction Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
 
 
 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@@ -66,6 +62,10 @@ Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid
 
 
+Test_GaugeAction_SOURCES=Test_GaugeAction.cc
+Test_GaugeAction_LDADD=-lGrid
+
+
 Test_main_SOURCES=Test_main.cc
 Test_main_LDADD=-lGrid
 

From f19518d56466b59344c208680e9e14d954ea7c4c Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 10:25:29 +0100
Subject: [PATCH 09/24] Unary ops and coarse grid support

---
 lib/tensors/Tensor_traits.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h
index c47dca10..b5715c93 100644
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@@ -119,7 +119,6 @@ namespace Grid {
     static const bool value = true;
     static const bool notvalue = false;
   };
-
   template<> struct isGridTensor<int > {
     static const bool value = false;
     static const bool notvalue = true;

From 1048304f3081c1e4b7661c943e0dfb43f781f447 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 10:26:19 +0100
Subject: [PATCH 10/24] Some unary ops and coarse grid support

---
 lib/Grid.h                                    |   1 -
 lib/Make.inc                                  |   2 +-
 lib/algorithms/CoarsenedMatrix.h              | 134 +++++++++++++++---
 lib/lattice/Lattice_base.h                    |   4 +-
 .../Lattice_comparison_utils.h}               |   2 -
 lib/lattice/Lattice_coordinate.h              |  14 +-
 lib/lattice/Lattice_transfer.h                |  23 ++-
 lib/lattice/Lattice_unary.h                   |  41 ++++++
 lib/qcd/action/fermion/WilsonFermion5D.cc     |   4 +
 lib/simd/Grid_vector_unops.h                  |  12 ++
 lib/tensors/Tensor_class.h                    |   9 +-
 lib/tensors/Tensor_unary.h                    |  29 ++++
 scripts/linecount                             |  11 +-
 tests/Test_cayley_coarsen_support.cc          |  19 ++-
 tests/Test_cshift.cc                          |   1 -
 15 files changed, 258 insertions(+), 48 deletions(-)
 rename lib/{Comparison.h => lattice/Lattice_comparison_utils.h} (98%)

diff --git a/lib/Grid.h b/lib/Grid.h
index 16530434..aec0e9ac 100644
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -55,7 +55,6 @@
 #include <Cartesian.h> // subdir aggregate
 #include <Tensors.h>   // subdir aggregate
 #include <Lattice.h>   // subdir aggregate
-#include <Comparison.h>
 #include <Cshift.h>    // subdir aggregate
 #include <Stencil.h>   // subdir aggregate
 #include <Algorithms.h>// subdir aggregate
diff --git a/lib/Make.inc b/lib/Make.inc
index c38606e2..7c7e02c8 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h
+HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h
 
 CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h
index 54ea279e..006da07f 100644
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -6,12 +6,42 @@
 namespace Grid {
 
   class Geometry {
+    //    int dimension;
   public:
     int npoint;
-    int dimension;
     std::vector<int> directions   ;
     std::vector<int> displacements;
+    std::vector<int> opdirs;
 
+    // FIXME -- don't like xposing the operator directions
+    // as different to the geometrical dirs
+    // Also don't like special casing five dim.. should pass an object in template
+  Geometry(int _d)  {
+  
+      int base = (_d==5) ? 1:0;
+
+      // make coarse grid stencil for 4d , not 5d
+      if ( _d==5 ) _d=4;
+
+      npoint = 2*_d+1;
+      directions.resize(npoint);
+      displacements.resize(npoint);
+      opdirs.resize(npoint);
+      for(int d=0;d<_d;d++){
+	directions[2*d  ] = d+base;
+	directions[2*d+1] = d+base;
+	opdirs[2*d  ] = d;
+	opdirs[2*d+1] = d;
+	displacements[2*d  ] = +1;
+	displacements[2*d+1] = -1;
+      }
+      directions   [2*_d]=0;
+      displacements[2*_d]=0;
+      opdirs       [2*_d]=0;
+    }
+  
+    /*
+      // Original cleaner code
     Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
       for(int d=0;d<dimension;d++){
 	directions[2*d  ] = d;
@@ -22,12 +52,12 @@ namespace Grid {
       directions   [2*dimension]=0;
       displacements[2*dimension]=0;
     }
-    
     std::vector<int> GetDelta(int point) {
       std::vector<int> delta(dimension,0);
       delta[directions[point]] = displacements[point];
       return delta;
     };
+    */    
 
   };
   
@@ -50,7 +80,10 @@ namespace Grid {
     Geometry         geom;
     GridBase *       _grid; 
     CartesianStencil Stencil; 
+
     std::vector<CoarseMatrix> A;
+    std::vector<CoarseMatrix> Aslow;
+
     std::vector<siteVector,alignedAllocator<siteVector> >   comm_buf;
       
     ///////////////////////
@@ -63,7 +96,7 @@ namespace Grid {
       SimpleCompressor<siteVector> compressor;
       Stencil.HaloExchange(in,comm_buf,compressor);
 
-      //PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
       for(int ss=0;ss<Grid()->oSites();ss++){
         siteVector res = zero;
 	siteVector tmp;
@@ -101,43 +134,112 @@ namespace Grid {
       _grid(&CoarseGrid),
       geom(CoarseGrid._ndimension),
       Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-      A(geom.npoint,&CoarseGrid)
+	A(geom.npoint,&CoarseGrid),
+	Aslow(geom.npoint,&CoarseGrid)
     {
       comm_buf.resize(Stencil._unified_buffer_size);
     };
 
     void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,std::vector<Lattice<Fobj> > & subspace){
 
-      FineField  phi(FineGrid);
-      FineField Mphi(FineGrid);
-      CoarseVector Proj(Grid()); 
+      FineField iblock(FineGrid); // contributions from within this block
+      FineField oblock(FineGrid); // contributions from outwith this block
+
+      FineField     phi(FineGrid);
+      FineField     tmp(FineGrid);
+      FineField     zz(FineGrid); zz=zero;
+      FineField    Mphi(FineGrid);
+
+      Lattice<iScalar<vInteger> > coor(FineGrid);
+
+      CoarseVector iProj(Grid()); 
+      CoarseVector oProj(Grid()); 
       CoarseScalar InnerProd(Grid()); 
 
       // Orthogonalise the subblocks over the basis
       blockOrthogonalise(InnerProd,subspace);
+      blockProject(iProj,subspace[0],subspace);
 
       // Compute the matrix elements of linop between this orthonormal
       // set of vectors.
+      int self_stencil=-1;
+      for(int p=0;p<geom.npoint;p++){ 
+	A[p]=zero;
+	if( geom.displacements[p]==0){
+	  self_stencil=p;
+	}
+      }
+      assert(self_stencil!=-1);
+
       for(int i=0;i<nbasis;i++){
 	phi=subspace[i];
 	for(int p=0;p<geom.npoint;p++){ 
-	  int dir = geom.directions[p];
+
+	  int dir   = geom.directions[p];
+	  int opdir = geom.opdirs[p];
 	  int disp= geom.displacements[p];
 
-	  if ( disp==0 )linop.OpDiag(phi,Mphi);
-	  else linop.OpDir(phi,Mphi,dir,disp); 
+	  int block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
 
-	  blockProject(Proj,Mphi,subspace);
+	  LatticeCoordinate(coor,dir);
 
-	  for(int ss=0;ss<Grid()->oSites();ss++){
-	    for(int j=0;j<nbasis;j++){
-	      A[p]._odata[ss](j,i) = Proj._odata[ss](j);
-	    }
+	  if ( disp==0 ){
+	    linop.OpDiag(phi,Mphi);
+	  }
+	  else  {
+	    linop.OpDir(phi,Mphi,opdir,disp); 
 	  }
 
-	}	
+	  ////////////////////////////////////////////////////////////////////////
+	  // Pick out contributions coming from this cell and neighbour cell
+	  ////////////////////////////////////////////////////////////////////////
+	  if ( disp==0 ) {
+	    iblock = Mphi;
+	    oblock = zero;
+	  } else if ( disp==1 ) {
+	    oblock = where(mod(coor,block)==(block-1),Mphi,zz);
+	    iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
+	  } else if ( disp==-1 ) {
+	    oblock = where(mod(coor,block)==0,Mphi,zz);
+	    iblock = where(mod(coor,block)!=0,Mphi,zz);
+	  } else {
+	    assert(0);
+	  }
+
+	  blockProject(iProj,iblock,subspace);
+	  blockProject(oProj,oblock,subspace);
+	  for(int ss=0;ss<Grid()->oSites();ss++){
+	    for(int j=0;j<nbasis;j++){
+	      if( disp!= 0 ) {
+		A[p]._odata[ss](j,i) = oProj._odata[ss](j);
+	      }
+	      A[self_stencil]._odata[ss](j,i) =	A[self_stencil]._odata[ss](j,i) + iProj._odata[ss](j);
+	    }
+	  }
+	}
       }
+
+#if 0
+      ///////////////////////////
+      // test code worth preserving in if block
+      ///////////////////////////
+      std::cout<< " Computed matrix elements "<< self_stencil <<std::endl;
+      for(int p=0;p<geom.npoint;p++){
+	std::cout<< "A["<<p<<"]" << std::endl;
+	std::cout<< A[p] << std::endl;
+      }
+      std::cout<< " picking by block0 "<< self_stencil <<std::endl;
+
+      phi=subspace[0];
+      std::vector<int> bc(FineGrid->_ndimension,0);
+
+      blockPick(Grid(),phi,tmp,bc);      // Pick out a block
+      linop.Op(tmp,Mphi);                // Apply big dop
+      blockProject(iProj,Mphi,subspace); // project it and print it
+      std::cout<< " Computed matrix elements from block zero only "<<std::endl;
+      std::cout<< iProj <<std::endl;
       std::cout<<"Computed Coarse Operator"<<std::endl;
+#endif
     }
     
   };
diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h
index ce2990a5..5a17826b 100644
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -297,11 +297,13 @@ PARALLEL_FOR_LOOP
 #include <lattice/Lattice_reduction.h>
 #include <lattice/Lattice_peekpoke.h>
 #include <lattice/Lattice_reality.h>
+#include <lattice/Lattice_comparison_utils.h>
+#include <lattice/Lattice_comparison.h>
 #include <lattice/Lattice_coordinate.h>
+#include <lattice/Lattice_where.h>
 #include <lattice/Lattice_rng.h>
 #include <lattice/Lattice_unary.h>
 #include <lattice/Lattice_transfer.h>
 
 
-
 #endif
diff --git a/lib/Comparison.h b/lib/lattice/Lattice_comparison_utils.h
similarity index 98%
rename from lib/Comparison.h
rename to lib/lattice/Lattice_comparison_utils.h
index ecd6ece0..d5e3b3e5 100644
--- a/lib/Comparison.h
+++ b/lib/lattice/Lattice_comparison_utils.h
@@ -141,7 +141,5 @@ namespace Grid {
     }
 }
 
-#include <lattice/Lattice_comparison.h>
-#include <lattice/Lattice_where.h>
 
 #endif
diff --git a/lib/lattice/Lattice_coordinate.h b/lib/lattice/Lattice_coordinate.h
index c49aed89..a83dbe49 100644
--- a/lib/lattice/Lattice_coordinate.h
+++ b/lib/lattice/Lattice_coordinate.h
@@ -3,20 +3,8 @@
 
 namespace Grid {
 
-  /*
-  depbase=`echo Grid_main.o | sed 's|[^/]*$|.deps/&|;s|\.o$||'`;\
-        icpc -DHAVE_CONFIG_H -I. -I../lib    -I../lib -mmic -O3 -std=c++11 -fopenmp -MT Grid_main.o -MD -MP -MF $depbase.Tpo -c -o Grid_main.o Grid_main.cc &&\
-        mv -f $depbase.Tpo $depbase.Po
-	  ../lib/lattice/Grid_lattice_coordinate.h(25): error: no suitable user-defined conversion from "vector_type" to "const Grid::iScalar<Grid::iScalar<Grid::iScalar<Grid::vInteger>>>" exists
-    l._odata[o]=vI;
-                    ^
-          detected during instantiation of "void Grid::LatticeCoordinate(Grid::Lattice<iobj> &, int) [with iobj=Grid::QCD::vTInteger]" at line 283 of "Grid_main.cc"
-
-	    compilation aborted for Grid_main.cc (code 2)
-*/
     template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
     {
-      typedef typename iobj::scalar_object scalar_object;
       typedef typename iobj::scalar_type scalar_type;
       typedef typename iobj::vector_type vector_type;
 
@@ -33,7 +21,7 @@ namespace Grid {
 	  mergebuf[i]=(Integer)gcoor[mu];
 	}
 	merge<vector_type,scalar_type>(vI,mergebuf);
-	l._odata[o]._internal._internal._internal=vI;
+	l._odata[o]=vI;
       }
     };
 
diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index 871ed893..12993680 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -166,9 +166,10 @@ template<class vobj,class CComplex>
 inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 {
   GridBase *coarse = ip._grid;
+  Lattice<vobj> zz(fineX._grid); zz=zero;
   blockInnerProduct(ip,fineX,fineX);
   ip = rsqrt(ip);
-  blockZAXPY(fineX,ip,fineX,fineX);
+  blockZAXPY(fineX,ip,fineX,zz);
 }
 // useful in multigrid project;
 // Generic name : Coarsen?
@@ -205,6 +206,26 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
   return;
 }
 
+template<class vobj>
+inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,std::vector<int> coor)
+{
+  GridBase * fine = unpicked._grid;
+
+  Lattice<vobj> zz(fine);
+  Lattice<iScalar<vInteger> > fcoor(fine);
+
+  zz = zero;
+
+  picked = unpicked;
+  for(int d=0;d<fine->_ndimension;d++){
+    LatticeCoordinate(fcoor,d);
+    int block= fine->_rdimensions[d] / coarse->_rdimensions[d];
+    int lo   = (coor[d])*block;
+    int hi   = (coor[d]+1)*block;
+    picked = where( (fcoor<hi) , picked, zz);
+    picked = where( (fcoor>=lo), picked, zz);
+  }
+}
 
 template<class vobj,class CComplex>
 inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
diff --git a/lib/lattice/Lattice_unary.h b/lib/lattice/Lattice_unary.h
index 131d7d68..fcdcc484 100644
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -26,7 +26,48 @@ PARALLEL_FOR_LOOP
     }
     return ret;
   }
+  template<class obj> Lattice<obj> sin(const Lattice<obj> &rhs){
+    Lattice<obj> ret(rhs._grid);
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      ret._odata[ss]=sin(rhs._odata[ss]);
+    }
+    return ret;
+  }
+  template<class obj> Lattice<obj> cos(const Lattice<obj> &rhs){
+    Lattice<obj> ret(rhs._grid);
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      ret._odata[ss]=cos(rhs._odata[ss]);
+    }
+    return ret;
+  }
 
 
+  template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){
+    Lattice<obj> ret(rhs._grid);
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      ret._odata[ss]=pow(rhs._odata[ss],y);
+    }
+    return ret;
+  }
+  template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs,Integer y){
+    Lattice<obj> ret(rhs._grid);
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      ret._odata[ss]=mod(rhs._odata[ss],y);
+    }
+    return ret;
+  }
+
 }
 #endif
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 4ddd14ab..7d9ffe22 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -85,6 +85,7 @@ void WilsonFermion5D::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGau
 void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp)
 {
   assert( (disp==1)||(disp==-1) );
+  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
 
   WilsonCompressor compressor(DaggerNo);
   Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
@@ -93,6 +94,9 @@ void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int
 
   int dirdisp = dir+skip*4;
 
+  assert(dirdisp<=7);
+  assert(dirdisp>=0);
+
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<Umu._grid->oSites();ss++){
     for(int s=0;s<Ls;s++){
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
index 9aeec350..8e2b6956 100644
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -35,6 +35,14 @@ namespace Grid {
     }
   };
 
+  template<class scalar> struct ModIntFunctor {
+    Integer y;
+  ModIntFunctor(Integer _y) : y(_y) {};
+    scalar operator()(const scalar &a)  const {
+      return Integer(a)%y;
+    }
+  };
+
   template < class S, class V > 
   inline Grid_simd<S,V> sqrt(const Grid_simd<S,V> &r) {
     return SimdApply(SqrtRealFunctor<S>(),r);
@@ -55,6 +63,10 @@ namespace Grid {
   inline Grid_simd<S,V> pow(const Grid_simd<S,V> &r,double y) {
     return SimdApply(PowRealFunctor<S>(y),r);
   }
+  template < class S, class V > 
+  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
+    return SimdApply(ModIntFunctor<S>(y),r);
+  }
 
 }
 #endif
diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h
index 3ed4f865..019e6fb1 100644
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -90,10 +90,10 @@ public:
   operator ComplexD () const { return(TensorRemove(_internal)); };
   operator RealD () const { return(real(TensorRemove(_internal))); }
   
-  // convert from a something to a scalar
-  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar<vtype>
+  // convert from a something to a scalar via constructor of something arg
+  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline iScalar<vtype> operator = (T arg)
     { 
-      _internal = vtype(arg);
+      _internal = arg;
       return *this;
     }
 
@@ -316,7 +316,8 @@ public:
 	stream<<o._internal[i][j];
 	if (i<N-1)	stream<<",";
       }
-      stream<<"}\n\t\t";
+      stream<<"}";
+      if(i!=N-1) stream<<"\n\t\t";
     }
     stream<<"}";
     return stream;
diff --git a/lib/tensors/Tensor_unary.h b/lib/tensors/Tensor_unary.h
index 1fbec985..2a546701 100644
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -27,11 +27,40 @@ template<class obj,int N> inline auto func(const iMatrix<obj,N> &z) -> iMatrix<o
     return ret;\
 }
 
+
+#define BINARY_RSCALAR(func,scal)					\
+template<class obj> inline iScalar<obj> func(const iScalar<obj> &z,scal y)	\
+{\
+    iScalar<obj> ret;\
+    ret._internal = func(z._internal,y);	\
+    return ret;\
+}\
+ template<class obj,int N> inline iVector<obj,N> func(const iVector<obj,N> &z,scal y) \
+{\
+    iVector<obj,N> ret;\
+    for(int c1=0;c1<N;c1++){\
+      ret._internal[c1] = func(z._internal[c1],y);	\
+    }\
+    return ret;\
+}\
+ template<class obj,int N> inline  iMatrix<obj,N> func(const iMatrix<obj,N> &z, scal y) \
+{\
+    iMatrix<obj,N> ret;\
+    for(int c1=0;c1<N;c1++){\
+    for(int c2=0;c2<N;c2++){\
+      ret._internal[c1][c2] = func(z._internal[c1][c2],y);	\
+    }}\
+    return ret;\
+}
+
 UNARY_REAL(sqrt);
 UNARY_REAL(rsqrt);
 UNARY_REAL(sin);
 UNARY_REAL(cos);
 
+BINARY_RSCALAR(mod,Integer);
+BINARY_RSCALAR(pow,RealD);
+
 
 }
 #endif
diff --git a/scripts/linecount b/scripts/linecount
index 7624c15f..0be67142 100755
--- a/scripts/linecount
+++ b/scripts/linecount
@@ -1,3 +1,12 @@
 #!/bin/sh 
 
-wc -l lib/*.h lib/*/*.h lib/*/*/*.h lib/*.cc lib/*/*.cc lib/*/*/*.cc tests/*.cc benchmarks/*.cc lib/*/*/*/*.cc lib/*/*/*/*.h
\ No newline at end of file
+wc -l lib/*.h lib/*/*.h lib/*/*/*.h lib/*.cc lib/*/*.cc lib/*/*/*.cc tests/*.cc benchmarks/*.cc lib/*/*/*/*.cc lib/*/*/*/*.h
+
+echo "lib"
+wc -l lib/*.h lib/*/*.h lib/*/*/*.h lib/*.cc lib/*/*.cc lib/*/*/*.cc lib/*/*/*/*.cc lib/*/*/*/*.h | grep total
+
+echo "tests"
+wc -l tests/*.cc | grep total
+
+echo "benchmarks"
+wc -l benchmarks/*.cc | grep total
diff --git a/tests/Test_cayley_coarsen_support.cc b/tests/Test_cayley_coarsen_support.cc
index acd96da3..064f5f13 100644
--- a/tests/Test_cayley_coarsen_support.cc
+++ b/tests/Test_cayley_coarsen_support.cc
@@ -23,7 +23,7 @@ int main (int argc, char ** argv)
 
 
 
-  const int Ls=8;
+  const int Ls=4;
 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -50,13 +50,18 @@ int main (int argc, char ** argv)
   LatticeFermion    tmp(FGrid);
   LatticeFermion    err(FGrid);
   LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
-
+#if 0
   std::vector<LatticeColourMatrix> U(4,UGrid);
-
-  for(int mu=0;mu<Nd;mu++){
-    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  Umu=zero;
+  Complex cone(1.0,0.0);
+  for(int nn=0;nn<Nd;nn++){
+    if(1) {
+      if (nn!=0) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }
+      else       { U[nn] = cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }
+    }
+    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
   }
-  
+#endif  
   RealD mass=0.5;
   RealD M5=1.8;
 
@@ -75,7 +80,7 @@ int main (int argc, char ** argv)
   err = result-ref;
   std::cout<<"Error "<<norm2(err)<<std::endl;
 
-  const int nbasis = 8;
+  const int nbasis = 2;
   std::vector<LatticeFermion> subspace(nbasis,FGrid);
   
   for(int b=0;b<nbasis;b++){
diff --git a/tests/Test_cshift.cc b/tests/Test_cshift.cc
index cc1da333..85f8529c 100644
--- a/tests/Test_cshift.cc
+++ b/tests/Test_cshift.cc
@@ -34,7 +34,6 @@ int main (int argc, char ** argv)
   }
 
 
-
   TComplex cm;
   
   for(int dir=0;dir<4;dir++){

From 99ace0a89ca1a96921a299c42677c51d9e97cb94 Mon Sep 17 00:00:00 2001
From: neo <cossu@post.kek.jp>
Date: Tue, 9 Jun 2015 18:59:45 +0900
Subject: [PATCH 11/24] Adding support for iMatrix exponentiation

---
 lib/simd/Old/Grid_vComplexD.h | 430 -------------------------------
 lib/simd/Old/Grid_vComplexF.h | 463 ----------------------------------
 lib/simd/Old/Grid_vInteger.h  | 259 -------------------
 lib/simd/Old/Grid_vRealD.h    | 276 --------------------
 lib/simd/Old/Grid_vRealF.h    | 313 -----------------------
 lib/tensors/Tensor_Ta.h       |  84 +++++-
 tests/InvSqrt.gnu             |   0
 tests/Sqrt.gnu                |   2 +
 tests/Test_main.cc            |   3 +
 9 files changed, 83 insertions(+), 1747 deletions(-)
 delete mode 100644 lib/simd/Old/Grid_vComplexD.h
 delete mode 100644 lib/simd/Old/Grid_vComplexF.h
 delete mode 100644 lib/simd/Old/Grid_vInteger.h
 delete mode 100644 lib/simd/Old/Grid_vRealD.h
 delete mode 100644 lib/simd/Old/Grid_vRealF.h
 create mode 100644 tests/InvSqrt.gnu
 create mode 100644 tests/Sqrt.gnu

diff --git a/lib/simd/Old/Grid_vComplexD.h b/lib/simd/Old/Grid_vComplexD.h
deleted file mode 100644
index 116c2866..00000000
--- a/lib/simd/Old/Grid_vComplexD.h
+++ /dev/null
@@ -1,430 +0,0 @@
-#ifndef GRID_VCOMPLEXD_H
-#define  GRID_VCOMPLEXD_H
-
-namespace Grid {
-    class vComplexD {
-    public:
-        zvec v;
-    public:
-	typedef zvec     vector_type;
-	typedef ComplexD scalar_type;
-
-        vComplexD & operator = ( Zero & z){
-            vzero(*this);
-            return (*this);
-        }
-        vComplexD( Zero & z){
-            vzero(*this);
-        }
-        vComplexD()=default;
-        vComplexD(ComplexD a){
-	  vsplat(*this,a);
-	};
-        vComplexD(double a){
-	  vsplat(*this,ComplexD(a));
-	};
- 
-        ///////////////////////////////////////////////
-        // mac, mult, sub, add, adj
-        // Should do an AVX2 version with mac.
-       ///////////////////////////////////////////////
-        friend inline void mac (vComplexD * __restrict__ y,const vComplexD * __restrict__ a,const vComplexD *__restrict__ x) {*y = (*a)*(*x)+(*y);};
-        friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
-        friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
-        friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
-        friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); }
-
-        //////////////////////////////////
-        // Initialise to 1,0,i
-        //////////////////////////////////
-        friend inline void vone      (vComplexD &ret){ vsplat(ret,1.0,0.0);}
-        friend inline void vzero     (vComplexD &ret){ vsplat(ret,0.0,0.0);}
-        friend inline void vcomplex_i(vComplexD &ret){ vsplat(ret,0.0,1.0);}
-        
-        ////////////////////////////////////
-        // Arithmetic operator overloads +,-,*
-        ////////////////////////////////////
-        friend inline vComplexD operator + (vComplexD a, vComplexD b)
-        {
-            vComplexD ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_add_pd(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_add_pd(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_add_pd(a.v,b.v);
-#endif
-#ifdef QPX
-            ret.v = vec_add(a.v,b.v);
-#endif
-            return ret;
-        };
-        
-        friend inline vComplexD operator - (vComplexD a, vComplexD b)
-        {
-            vComplexD ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_sub_pd(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_sub_pd(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_sub_pd(a.v,b.v);
-#endif
-#ifdef QPX
-            ret.v = vec_sub(a.v,b.v);
-#endif
-            return ret;
-        };
-        
-        friend inline vComplexD operator * (vComplexD a, vComplexD b)
-        {
-            vComplexD ret;
-            
-            //Multiplicationof (ak+ibk)*(ck+idk)
-            // a + i b can be stored as a data structure
-            //From intel optimisation reference guide
-            /*
-             movsldup xmm0, Src1; load real parts into the destination,
-             ; a1, a1, a0, a0
-             movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
-             mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
-             shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
-             movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
-             mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
-             addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
-             VSHUFPD (VEX.256 encoded version)
-             IF IMM0[0] = 0
-             THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
-             IF IMM0[1] = 0
-             THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
-             IF IMM0[2] = 0
-             THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
-             IF IMM0[3] = 0
-             THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
-             */
-#if defined (AVX1)|| defined (AVX2)
-            zvec ymm0,ymm1,ymm2;
-            ymm0 = _mm256_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar, ar,ar b'00,00
-            ymm0 = _mm256_mul_pd(ymm0,b.v);        // ymm0 <- ar bi, ar br
-            ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5); // ymm1 <- br,bi  b'01,01 
-            ymm2 = _mm256_shuffle_pd(a.v,a.v,0xF); // ymm2 <- ai,ai  b'11,11
-            ymm1 = _mm256_mul_pd(ymm1,ymm2);       // ymm1 <- br ai, ai bi
-            ret.v= _mm256_addsub_pd(ymm0,ymm1);
-#endif
-#ifdef SSE4
-            zvec ymm0,ymm1,ymm2;
-            ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar,
-            ymm0 = _mm_mul_pd(ymm0,b.v);        // ymm0 <- ar bi, ar br
-            ymm1 = _mm_shuffle_pd(b.v,b.v,0x1); // ymm1 <- br,bi   b01
-            ymm2 = _mm_shuffle_pd(a.v,a.v,0x3); // ymm2 <- ai,ai   b11
-            ymm1 = _mm_mul_pd(ymm1,ymm2);       // ymm1 <- br ai, ai bi
-            ret.v= _mm_addsub_pd(ymm0,ymm1);
-#endif
-#ifdef AVX512
-            /* This is from
-             * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets 
-             * @inproceedings{McFarlin:2011:ASV:1995896.1995938,
-             * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
-             * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
-             * booktitle = {Proceedings of the International Conference on Supercomputing},
-             * series = {ICS '11},
-             * year = {2011},
-             * isbn = {978-1-4503-0102-2},
-             * location = {Tucson, Arizona, USA},
-             * pages = {265--274},
-             * numpages = {10},
-             * url = {http://doi.acm.org/10.1145/1995896.1995938},
-             * doi = {10.1145/1995896.1995938},
-             * acmid = {1995938},
-             * publisher = {ACM},
-             * address = {New York, NY, USA},
-             * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
-	     *                } 
-             */
-            zvec vzero,ymm0,ymm1,real,imag;
-            vzero =(zvec)_mm512_setzero();
-            ymm0 =  _mm512_swizzle_pd(a.v, _MM_SWIZ_REG_CDAB); // 
-            real =(zvec)_mm512_mask_or_epi64((__m512i)a.v, 0xAA,(__m512i)vzero,(__m512i) ymm0);
-            imag =  _mm512_mask_sub_pd(a.v, 0x55,vzero, ymm0);
-            ymm1 =  _mm512_mul_pd(real, b.v);
-            ymm0 =  _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
-            ret.v=  _mm512_fmadd_pd(ymm0,imag,ymm1);
-             /* Imag OK */
-#endif
-#ifdef QPX
-            ret.v = vec_mul(a.v,b.v);
-#endif
-            return ret;
-        };
-
-	////////////////////////////////////////////////////////////////////
-	// General permute; assumes vector length is same across 
-	// all subtypes; may not be a good assumption, but could
-	// add the vector width as a template param for BG/Q for example
-	////////////////////////////////////////////////////////////////////
-	friend inline void permute(vComplexD &y,vComplexD b,int perm)
-	{
-	  Gpermute<vComplexD>(y,b,perm);
-	}
-	/*
-	friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
-	{
-	  Gmerge<vComplexD,ComplexD >(y,extracted);
-	}
-	friend inline void extract(const vComplexD &y,std::vector<ComplexD *> &extracted)
-	{
-	  Gextract<vComplexD,ComplexD>(y,extracted);
-	}
-	friend inline void merge(vComplexD &y,std::vector<ComplexD > &extracted)
-	{
-	  Gmerge<vComplexD,ComplexD >(y,extracted);
-	}
-	friend inline void extract(const vComplexD &y,std::vector<ComplexD > &extracted)
-	{
-	  Gextract<vComplexD,ComplexD>(y,extracted);
-	}
-	*/
-
-        ///////////////////////
-        // Splat
-        ///////////////////////
-        friend inline void vsplat(vComplexD &ret,ComplexD c){
-            float a= real(c);
-            float b= imag(c);
-            vsplat(ret,a,b);
-        }
-
-
-        friend inline void vsplat(vComplexD &ret,double rl,double ig){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set_pd(ig,rl,ig,rl);
-#endif
-#ifdef SSE4
-            ret.v = _mm_set_pd(ig,rl);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl);
-#endif
-#ifdef QPX
-            ret.v = {ig,rl,ig,rl};
-#endif
-        }
-
-        friend inline void vset(vComplexD &ret,ComplexD *a){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
-#endif
-#ifdef SSE4
-            ret.v = _mm_set_pd(a[0].imag(),a[0].real());
-#endif
-#ifdef AVX512
-            ret.v = _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
-            // Note v has a0 a1 a2 a3 
-#endif
-#ifdef QPX
-            ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[3].imag()};
-#endif
-        }
-
-friend inline void vstore(const vComplexD &ret, ComplexD *a){
-#if defined (AVX1)|| defined (AVX2)
-       _mm256_store_pd((double *)a,ret.v);
-#endif
-#ifdef SSE4
-       _mm_store_pd((double *)a,ret.v);
-#endif
-#ifdef AVX512
-       _mm512_store_pd((double *)a,ret.v);
-   //Note v has a3 a2 a1 a0
-#endif
-#ifdef QPX
-	assert(0);
-#endif
-        }
-        friend inline void vstream(vComplexD &out,const vComplexD &in){
-#if defined (AVX1)|| defined (AVX2)
-	  _mm256_stream_pd((double *)&out.v,in.v);
-#endif
-#ifdef SSE4
-	  _mm_stream_pd((double *)&out.v,in.v);
-#endif
-#ifdef AVX512
-	  _mm512_storenrngo_pd((double *)&out.v,in.v);
-	  //	  _mm512_stream_pd((double *)&out.v,in.v);
-	  //Note v has a3 a2 a1 a0
-#endif
-#ifdef QPX
-	  assert(0);
-#endif
-	}
-      friend inline void prefetch(const vComplexD &v)
-        {
-            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
-        }
-
-        ////////////////////////
-        // Conjugate
-        ////////////////////////
-        friend inline vComplexD conjugate(const vComplexD &in){
-            vComplexD ret ; vzero(ret);
-#if defined (AVX1)|| defined (AVX2)
-	    //	    addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
-	    zvec tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
-	    ret.v    =_mm256_shuffle_pd(tmp,tmp,0x5);
-#endif
-#ifdef SSE4
-	    zvec tmp = _mm_addsub_pd(ret.v,_mm_shuffle_pd(in.v,in.v,0x1));
-	    ret.v    = _mm_shuffle_pd(tmp,tmp,0x1);
-#endif
-#ifdef AVX512
-	    ret.v = _mm512_mask_sub_pd(in.v, 0xaa,ret.v, in.v);             
-#endif
-#ifdef QPX
-	     assert(0);
-#endif
-            return ret;
-        }
-
-        friend inline void timesMinusI(vComplexD &ret,const vComplexD &in){
-	  vzero(ret);
-	  vComplexD tmp;
-#if defined (AVX1)|| defined (AVX2)
-	  tmp.v    =_mm256_addsub_pd(ret.v,in.v); // r,-i
-          ret.v    =_mm256_shuffle_pd(tmp.v,tmp.v,0x5);
-#endif
-#ifdef SSE4
-	  tmp.v    =_mm_addsub_pd(ret.v,in.v); // r,-i
-          ret.v    =_mm_shuffle_pd(tmp.v,tmp.v,0x1);
-#endif
-#ifdef AVX512
-          ret.v = _mm512_mask_sub_pd(in.v,0xaa,ret.v,in.v); // real -imag 
-	  ret.v = _mm512_swizzle_pd(ret.v, _MM_SWIZ_REG_CDAB);// OK
-#endif
-#ifdef QPX
-            assert(0);
-#endif
-	}
-
-        friend inline void timesI(vComplexD &ret, const vComplexD &in){
-	  vzero(ret);
-	  vComplexD tmp;
-#if defined (AVX1)|| defined (AVX2)
-	  tmp.v    =_mm256_shuffle_pd(in.v,in.v,0x5);
-          ret.v    =_mm256_addsub_pd(ret.v,tmp.v); // i,-r
-#endif
-#ifdef SSE4
-	  tmp.v    =_mm_shuffle_pd(in.v,in.v,0x1);
-          ret.v    =_mm_addsub_pd(ret.v,tmp.v); // r,-i
-#endif
-#ifdef AVX512
-          tmp.v    = _mm512_swizzle_pd(in.v, _MM_SWIZ_REG_CDAB);// OK
-	  ret.v    = _mm512_mask_sub_pd(tmp.v,0xaa,ret.v,tmp.v); // real -imag 
-#endif
-#ifdef QPX
-            assert(0);
-#endif
-	}
-
-        friend inline vComplexD timesMinusI(const vComplexD &in){
-	  vComplexD ret; 
-	  timesMinusI(ret,in);
-	  return ret;
-	}
-
-        friend inline vComplexD timesI(const vComplexD &in){
-	  vComplexD ret; 
-	  timesI(ret,in);
-	  return ret;
-	}
-
-
-// REDUCE FIXME must be a cleaner implementation
-       friend inline ComplexD Reduce(const vComplexD & in)
-       { 
-	 vComplexD v1,v2;
-	 union { 
-	   zvec v;
-	   double f[sizeof(zvec)/sizeof(double)];
-	 } conv;
-	   
-#ifdef SSE4
-	 v1=in;
-#endif
-#if defined(AVX1) || defined (AVX2)
-	 permute(v1,in,0); // sse 128; paired complex single
-	 v1=v1+in;
-#endif
-#ifdef AVX512
-	 permute(v1,in,0); // sse 128; paired complex single
-	 v1=v1+in;
-	 permute(v2,v1,1); // avx 256; quad complex single
-	 v1=v1+v2;
-#endif 
-#ifdef QPX
-#error
-#endif
-	 conv.v = v1.v;
-	 return ComplexD(conv.f[0],conv.f[1]);
-        }
-        
-        // Unary negation
-        friend inline vComplexD operator -(const vComplexD &r) {
-            vComplexD ret;
-            vzero(ret);
-            ret = ret - r;
-            return ret;
-        }
-        // *=,+=,-= operators
-        inline vComplexD &operator *=(const vComplexD &r) {
-            *this = (*this)*r;
-            return *this;
-        }
-        inline vComplexD &operator +=(const vComplexD &r) {
-            *this = *this+r;
-            return *this;
-        }
-        inline vComplexD &operator -=(const vComplexD &r) {
-            *this = *this-r;
-            return *this;
-        }
-
-    public:
-        static int Nsimd(void) { return sizeof(zvec)/sizeof(double)/2;}
-    };
-
-
-    inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; }
-
-
-    typedef  vComplexD vDComplex;
-    inline void zeroit(vComplexD &z){ vzero(z);}
-
-    inline vComplexD outerProduct(const vComplexD &l, const vComplexD& r)
-    {
-        return l*r;
-    }
-    inline vComplexD trace(const vComplexD &arg){
-        return arg;
-    }
-/////////////////////////////////////////////////////////////////////////
-//// Generic routine to promote object<complex> -> object<vcomplex>
-//// Supports the array reordering transformation that gives me SIMD utilisation
-///////////////////////////////////////////////////////////////////////////
-/*
-template<template<class> class object>
-inline object<vComplex> splat(object<Complex >s){
-      object<vComplex> ret;
-      vComplex * v_ptr = (vComplex *)& ret;
-      Complex * s_ptr = (Complex *) &s;
-      for(int i=0;i<sizeof(ret);i+=sizeof(vComplex)){
-          vsplat(*(v_ptr++),*(s_ptr++));
-      }
-      return ret;
-    }
-*/
-}
-#endif
diff --git a/lib/simd/Old/Grid_vComplexF.h b/lib/simd/Old/Grid_vComplexF.h
deleted file mode 100644
index 828145d6..00000000
--- a/lib/simd/Old/Grid_vComplexF.h
+++ /dev/null
@@ -1,463 +0,0 @@
-#ifndef GRID_VCOMPLEXF
-#define GRID_VCOMPLEXF
-
-namespace Grid {
-
-  /*
-  inline void Print(const char *A,cvec c) { 
-	  float *fp=(float *)&c; 
-	  printf(A); 
-	  printf(" %le %le %le %le %le %le %le %le\n",
-		 fp[0],fp[1],fp[2],fp[3],fp[4],fp[5],fp[6],fp[7]);
-	}
-  */
-
-    class vComplexF {
-      //    protected:
-
-    public:
-        cvec v;
-        
-    public:
-        static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
-    public:
-	typedef cvec     vector_type;
-	typedef ComplexF scalar_type;
-
-        vComplexF & operator = ( Zero & z){
-            vzero(*this);
-            return (*this);
-        }
-	//        vComplexF( Zero & z){
-	//            vzero(*this);
-	//        }
-        vComplexF()=default;
-        vComplexF(ComplexF a){
-	  vsplat(*this,a);
-	};
-        vComplexF(double a){
-	  vsplat(*this,ComplexF(a));
-	};
-       
-        ///////////////////////////////////////////////
-        // mac, mult, sub, add, adj
-        // Should do an AVX2 version with mac.
-        ///////////////////////////////////////////////
-        friend inline void mac (vComplexF * __restrict__ y,const vComplexF * __restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
-        friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
-        friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
-        friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
-        friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); }
-        
-        //////////////////////////////////
-        // Initialise to 1,0,i
-        //////////////////////////////////
-        friend inline void vone(vComplexF &ret)      { vsplat(ret,1.0,0.0); }
-        friend inline void vzero(vComplexF &ret)     { vsplat(ret,0.0,0.0); }
-        friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0); }
-          
-        ////////////////////////////////////
-        // Arithmetic operator overloads +,-,*
-        ////////////////////////////////////
-        friend inline vComplexF operator + (vComplexF a, vComplexF b)
-        {
-            vComplexF ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_add_ps(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_add_ps(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_add_ps(a.v,b.v);
-#endif
-#ifdef QPX
-#error
-#endif
-            return ret;
-        };
-        
-        friend inline vComplexF operator - (vComplexF a, vComplexF b)
-        {
-            vComplexF ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_sub_ps(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_sub_ps(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_sub_ps(a.v,b.v);
-#endif
-#ifdef QPX
-#error
-#endif
-            return ret;
-        };
-        
-        friend inline vComplexF operator * (vComplexF a, vComplexF b)
-        {
-            vComplexF ret;
-            
-            //Multiplicationof (ak+ibk)*(ck+idk)
-            // a + i b can be stored as a data structure
-            //From intel optimisation reference
-            /*
-             movsldup xmm0, Src1; load real parts into the destination,
-             ; a1, a1, a0, a0
-             movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
-             mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
-             shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
-             movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
-             mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
-             addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
-             */
-#if defined (AVX1)|| defined (AVX2)
-            cvec ymm0,ymm1,ymm2;
-            ymm0 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
-            ymm0 = _mm256_mul_ps(ymm0,b.v);        // ymm0 <- ar bi, ar br
-            // FIXME AVX2 could MAC
-            ymm1 = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
-            ymm2 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
-            ymm1 = _mm256_mul_ps(ymm1,ymm2);       // ymm1 <- br ai, ai bi
-            ret.v= _mm256_addsub_ps(ymm0,ymm1);    
-#endif
-#ifdef SSE4
-            cvec ymm0,ymm1,ymm2;
-            ymm0 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
-            ymm0 = _mm_mul_ps(ymm0,b.v);        // ymm0 <- ar bi, ar br
-            ymm1 = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
-            ymm2 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
-            ymm1 = _mm_mul_ps(ymm1,ymm2);       // ymm1 <- br ai, ai bi
-            ret.v= _mm_addsub_ps(ymm0,ymm1);
-#endif
-#ifdef AVX512
-//
-            cvec vzero,ymm0,ymm1,real, imag;
-            vzero = _mm512_setzero();
-            ymm0  = _mm512_swizzle_ps(a.v, _MM_SWIZ_REG_CDAB); // 
-            real  = (__m512)_mm512_mask_or_epi32((__m512i)a.v, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
-            imag  = _mm512_mask_sub_ps(a.v, 0x5555,vzero, ymm0);
-            ymm1  = _mm512_mul_ps(real, b.v);
-            ymm0  = _mm512_swizzle_ps(b.v, _MM_SWIZ_REG_CDAB); // OK
-            ret.v = _mm512_fmadd_ps(ymm0,imag,ymm1);
-
-
-#endif
-#ifdef QPX
-            ret.v = vec_mul(a.v,b.v);
-#endif
-            return ret;
-        };
-      
-
-	////////////////////////////////////////////////////////////////////////
-	// FIXME:  gonna remove these load/store, get, set, prefetch
-	////////////////////////////////////////////////////////////////////////
-        friend inline void vset(vComplexF &ret, ComplexF *a){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
-#endif
-#ifdef SSE4
-            ret.v = _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
-#endif
-#ifdef AVX512
-            ret.v = _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
-            // Note v has a0 a1 a2 a3 a4 a5 a6 a7
-#endif
-#ifdef QPX
-	    ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
-#endif
-        }
-        
-        ///////////////////////
-        // Splat
-        ///////////////////////
-        friend inline void vsplat(vComplexF &ret,ComplexF c){
-            float a= real(c);
-            float b= imag(c);
-            vsplat(ret,a,b);
-        }
-
-	friend inline void vstore(const vComplexF &ret, ComplexF *a){
-#if defined (AVX1)|| defined (AVX2)
-	  _mm256_store_ps((float *)a,ret.v);
-#endif
-#ifdef SSE4
-	  _mm_store_ps((float *)a,ret.v);
-#endif
-#ifdef AVX512
-	  _mm512_store_ps((float *)a,ret.v);
-	  //Note v has a3 a2 a1 a0
-#endif
-#ifdef QPX
-	  assert(0);
-#endif
-	}
-        friend inline void vstream(vComplexF &out,const vComplexF &in){
-#if defined (AVX1)|| defined (AVX2)
-	  _mm256_stream_ps((float *)&out.v,in.v);
-#endif
-#ifdef SSE4
-	  _mm_stream_ps((float *)&out.v,in.v);
-#endif
-#ifdef AVX512
-	  _mm512_storenrngo_ps((float *)&out.v,in.v);
-	  //Note v has a3 a2 a1 a0
-#endif
-#ifdef QPX
-	  assert(0);
-#endif
-	}
-      friend inline void prefetch(const vComplexF &v)
-        {
-            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
-        }
-
-        friend inline void vsplat(vComplexF &ret,float a,float b){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set_ps(b,a,b,a,b,a,b,a);
-#endif
-#ifdef SSE4
-            ret.v = _mm_set_ps(b,a,b,a);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
-#endif
-#ifdef QPX
-            ret.v = {a,b,a,b};
-#endif
-        }
-	friend inline void permute(vComplexF &y,vComplexF b,int perm)
-	{
-	  Gpermute<vComplexF>(y,b,perm);
-	}
-	friend inline ComplexF Reduce(const vComplexF & in)
-	{
-	 vComplexF v1,v2;
-	 union { 
-	   cvec v;
-	   float f[sizeof(cvec)/sizeof(float)];
-	 } conv;
-#ifdef SSE4
-	 permute(v1,in,0); // sse 128; paired complex single
-	 v1=v1+in;
-#endif
-#if defined(AVX1) || defined (AVX2)
-	 permute(v1,in,0); // sse 128; paired complex single
-	 v1=v1+in;
-	 permute(v2,v1,1); // avx 256; quad complex single
-	 v1=v1+v2;
-#endif
-#ifdef AVX512
-	 permute(v1,in,0); // avx512 octo-complex single
-	 v1=v1+in;
-	 permute(v2,v1,1); 
-	 v1=v1+v2;
-	 permute(v2,v1,2); 
-	 v1=v1+v2;
-#endif
-#ifdef QPX
-#error
-#endif
-	 conv.v = v1.v;
-	 return ComplexF(conv.f[0],conv.f[1]);
-        }
-
-        friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
-            vComplexF va;
-            vsplat(va,a);
-            return va*b;
-        }
-        friend inline vComplexF operator * (vComplexF b,const ComplexF &a){
-	  return a*b;
-        }
-
-       /*
-	template<class real>
-        friend inline vComplexF operator * (vComplexF b,const real &a){
-            vComplexF va;
-	    Complex ca(a,0);
-            vsplat(va,ca);
-            return va*b;
-        }
-	template<class real>
-	friend inline vComplexF operator * (const real &a,vComplexF b){
-	  return a*b;
-	}
-
-        friend inline vComplexF operator + (const Complex &a, vComplexF b){
-            vComplexF va;
-            vsplat(va,a);
-            return va+b;
-        }
-        friend inline vComplexF operator + (vComplexF b,const Complex &a){
-            return a+b;
-        }
-	template<class real>
-        friend inline vComplexF operator + (vComplexF b,const real &a){
-            vComplexF va;
-	    Complex ca(a,0);
-            vsplat(va,ca);
-            return va+b;
-        }
-	template<class real>
-	friend inline vComplexF operator + (const real &a,vComplexF b){
-	  return a+b;
-	}
-        friend inline vComplexF operator - (const Complex &a, vComplexF b){
-            vComplexF va;
-            vsplat(va,a);
-            return va-b;
-        }
-        friend inline vComplexF operator - (vComplexF b,const Complex &a){
-            vComplexF va;
-            vsplat(va,a);
-            return b-va;
-        }
-	template<class real>
-        friend inline vComplexF operator - (vComplexF b,const real &a){
-            vComplexF va;
-	    Complex ca(a,0);
-            vsplat(va,ca);
-            return b-va;
-        }
-	template<class real>
-	friend inline vComplexF operator - (const real &a,vComplexF b){
-            vComplexF va;
-	    Complex ca(a,0);
-            vsplat(va,ca);
-            return va-b;
-	}
-       */
-       
-
-        ///////////////////////
-        // Conjugate
-        ///////////////////////
-								     
-        friend inline vComplexF conjugate(const vComplexF &in){
-            vComplexF ret ; vzero(ret);
-#if defined (AVX1)|| defined (AVX2)
-	    ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f));
-#endif
-#ifdef SSE4
-	    ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f));
-#endif
-#ifdef AVX512
-            ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag 
-#endif
-#ifdef QPX
-            assert(0);
-#endif
-            return ret;
-        }
-        friend inline void timesMinusI( vComplexF &ret,const vComplexF &in){
-	  vzero(ret);
-#if defined (AVX1)|| defined (AVX2)
-	  cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i
-          ret.v = _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
-#endif
-#ifdef SSE4
-	  cvec tmp =_mm_addsub_ps(ret.v,in.v); // r,-i
-          ret.v = _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
-#endif
-#ifdef AVX512
-          ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // real -imag 
-	  ret.v = _mm512_swizzle_ps(ret.v, _MM_SWIZ_REG_CDAB);// OK
-#endif
-#ifdef QPX
-            assert(0);
-#endif
-	}
-        friend inline void timesI(vComplexF &ret,const vComplexF &in){
-	  vzero(ret);
-#if defined (AVX1)|| defined (AVX2)
-	  cvec tmp =_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));//i,r
-          ret.v    =_mm256_addsub_ps(ret.v,tmp);     //i,-r
-#endif
-#ifdef SSE4
-	  cvec tmp =_mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));
-          ret.v = _mm_addsub_ps(ret.v,tmp); // r,-i
-#endif
-#ifdef AVX512
-          cvec tmp = _mm512_swizzle_ps(in.v, _MM_SWIZ_REG_CDAB);// OK
-	  ret.v    = _mm512_mask_sub_ps(tmp,0xaaaa,ret.v,tmp); // real -imag 
-#endif
-#ifdef QPX
-            assert(0);
-#endif
-	}
-        friend inline vComplexF timesMinusI(const vComplexF &in){
-	  vComplexF ret; 
-	  timesMinusI(ret,in);
-	  return ret;
-	}
-        friend inline vComplexF timesI(const vComplexF &in){
-	  vComplexF ret; 
-	  timesI(ret,in);
-	  return ret;
-	}
-
-        
-        // Unary negation
-        friend inline vComplexF operator -(const vComplexF &r) {
-            vComplexF ret;
-            vzero(ret);
-            ret = ret - r;
-            return ret;
-        }
-        // *=,+=,-= operators
-        inline vComplexF &operator *=(const vComplexF &r) {
-            *this = (*this)*r;
-            return *this;
-        }
-        inline vComplexF &operator +=(const vComplexF &r) {
-            *this = *this+r;
-            return *this;
-        }
-        inline vComplexF &operator -=(const vComplexF &r) {
-            *this = *this-r;
-            return *this;
-        }
-
-      /*
-      friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
-      {
-	Gmerge<vComplexF,ComplexF >(y,extracted);
-      }
-      friend inline void extract(const vComplexF &y,std::vector<ComplexF *> &extracted)
-      {
-	Gextract<vComplexF,ComplexF>(y,extracted);
-      }
-      friend inline void merge(vComplexF &y,std::vector<ComplexF > &extracted)
-      {
-	Gmerge<vComplexF,ComplexF >(y,extracted);
-      }
-      friend inline void extract(const vComplexF &y,std::vector<ComplexF > &extracted)
-      {
-	Gextract<vComplexF,ComplexF>(y,extracted);
-      }
-      */
-
-    };
-
-    inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) 
-    {
-      return conjugate(l)*r; 
-    }
-
-    inline void zeroit(vComplexF &z){ vzero(z);}
-
-    inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)
-    {
-        return l*r;
-    }
-    inline vComplexF trace(const vComplexF &arg){
-        return arg;
-    }
-
-
-}
-
-#endif
diff --git a/lib/simd/Old/Grid_vInteger.h b/lib/simd/Old/Grid_vInteger.h
deleted file mode 100644
index ee4a3633..00000000
--- a/lib/simd/Old/Grid_vInteger.h
+++ /dev/null
@@ -1,259 +0,0 @@
-#ifndef GRID_VINTEGER_H
-#define GRID_VINTEGER_H
-
-namespace Grid {
-
-
-  class vInteger {
-    protected:
-
-    public:
-
-      ivec v;
-
-	typedef ivec     vector_type;
-	typedef Integer scalar_type;
-
-        vInteger(){};
-        vInteger & operator = (const Zero & z){
-            vzero(*this);
-            return (*this);
-        }
-        vInteger(Integer a){
-	  vsplat(*this,a);
-	};
-        ////////////////////////////////////
-        // Arithmetic operator overloads +,-,*
-        ////////////////////////////////////
-        friend inline vInteger operator + ( vInteger a,  vInteger b)
-        {
-	  vInteger ret;
-#if defined (AVX1) 
-	  __m128i a0,a1;
-	  __m128i b0,b1;
-	  a0 = _mm256_extractf128_si256(a.v,0);
-	  b0 = _mm256_extractf128_si256(b.v,0);
-	  a1 = _mm256_extractf128_si256(a.v,1);
-	  b1 = _mm256_extractf128_si256(b.v,1);
-	  a0 = _mm_add_epi32(a0,b0);
-	  a1 = _mm_add_epi32(a1,b1);
-	  ret.v = _mm256_set_m128i(a1,a0);
-#endif
-#if defined (AVX2)
-            ret.v = _mm256_add_epi32(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_add_epi32(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_add_epi32(a.v,b.v);
-#endif
-#ifdef QPX
-            // Implement as array of ints is only option
-#error
-#endif
-            return ret;
-        };
-        
-        friend inline vInteger operator - ( vInteger a, vInteger b)
-        {
-            vInteger ret;
-#if defined (AVX1) 
-	  __m128i a0,a1;
-	  __m128i b0,b1;
-	  a0 = _mm256_extractf128_si256(a.v,0);
-	  b0 = _mm256_extractf128_si256(b.v,0);
-	  a1 = _mm256_extractf128_si256(a.v,1);
-	  b1 = _mm256_extractf128_si256(b.v,1);
-	  a0 = _mm_sub_epi32(a0,b0);
-	  a1 = _mm_sub_epi32(a1,b1);
-	  ret.v = _mm256_set_m128i(a1,a0);
-#endif
-#if defined (AVX2)
-            ret.v = _mm256_sub_epi32(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_sub_epi32(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_sub_epi32(a.v,b.v);
-#endif
-#ifdef QPX
-            // Implement as array of ints is only option
-#error
-#endif
-            return ret;
-        };
-
-        friend inline vInteger operator * ( vInteger a, vInteger b)
-        {
-            vInteger ret;
-#if defined (AVX1) 
-	  __m128i a0,a1;
-	  __m128i b0,b1;
-	  a0 = _mm256_extractf128_si256(a.v,0);
-	  b0 = _mm256_extractf128_si256(b.v,0);
-	  a1 = _mm256_extractf128_si256(a.v,1);
-	  b1 = _mm256_extractf128_si256(b.v,1);
-	  a0 = _mm_mul_epi32(a0,b0);
-	  a1 = _mm_mul_epi32(a1,b1);
-	  ret.v = _mm256_set_m128i(a1,a0);
-#endif
-#if defined (AVX2)
-            ret.v = _mm256_mul_epi32(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_mul_epi32(a.v,b.v);
-#endif
-#ifdef AVX512
-	    //            ret.v = _mm512_mul_epi32(a.v,b.v);
-	    ret.v = _mm512_mullo_epi32(a.v,b.v);
-#endif
-#ifdef QPX
-            // Implement as array of ints is only option
-#error
-#endif
-            return ret;
-        };
-        
-        ///////////////////////////////////////////////
-        // mult, sub, add, adj,conjugate, mac functions
-        ///////////////////////////////////////////////
-        friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
-        friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
-        friend inline void add (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) + (*r);}
-        friend inline void mac (vInteger &y,const vInteger a,const vInteger x){
-            y = a*x+y;
-	}
-        
-        //////////////////////////////////
-        // Initialise to 1,0,i
-        //////////////////////////////////
-        friend inline void vone (vInteger &ret){vsplat(ret,1);}
-        friend inline void vzero(vInteger &ret){vsplat(ret,0);}
-        friend inline void vtrue (vInteger &ret){vsplat(ret,0xFFFFFFFF);}
-        friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
-
-        
-        /////////////////////////////////////////////////////
-        // Broadcast a value across Nsimd copies.
-        /////////////////////////////////////////////////////
-        friend inline void vsplat(vInteger &ret,scalar_type a){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set1_epi32(a);
-#endif
-#ifdef SSE4
-            ret.v = _mm_set1_epi32(a);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_set1_epi32(a);
-#endif
-#ifdef QPX
-#error
-#endif
-        }
-        friend inline void vset(vInteger &ret,scalar_type *a){
-#if defined (AVX1)|| defined (AVX2)
-	  ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
-#endif
-#ifdef SSE4
-	  ret.v = _mm_set_epi32(a[0],a[1],a[2],a[3]);
-#endif
-#ifdef AVX512
-	  ret.v = _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
-                                   a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
-#endif
-#ifdef QPX
-#error
-#endif
-	}
-
-	friend inline void vstore(const vInteger &ret, Integer *a){
-#if defined (AVX1)|| defined (AVX2)
-	  _mm256_store_si256((__m256i*)a,ret.v);
-#endif
-#ifdef SSE4
-	  _mm_store_si128((__m128i *)a,ret.v);
-#endif
-#ifdef AVX512
-	  _mm512_store_si512(a,ret.v);
-#endif
-#ifdef QPX
-	  assert(0);
-#endif
-        }
-
-	friend inline void vstream(vInteger & out,const vInteger &in){
-	  out=in;
-        }
-
-        friend inline void prefetch(const vInteger &v)
-        {
-            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
-        }
-
-        // Unary negation
-        friend inline vInteger operator -(const vInteger &r) {
-            vInteger ret;
-            vzero(ret);
-            ret = ret - r;
-            return ret;
-        }
-       friend inline Integer Reduce(const vInteger & in)
-       {
-	 // unimplemented
-	 assert(0);
-       }
-        // *=,+=,-= operators
-        inline vInteger &operator *=(const vInteger &r) {
-            *this = (*this)*r;
-            return *this;
-        }
-        inline vInteger &operator +=(const vInteger &r) {
-            *this = *this+r;
-            return *this;
-        }
-        inline vInteger &operator -=(const vInteger &r) {
-            *this = *this-r;
-            return *this;
-        }
-
-      friend inline void permute(vInteger &y,const vInteger b,int perm)
-      {
-	Gpermute<vInteger>(y,b,perm);
-      }
-      /*
-      friend inline void merge(vInteger &y,std::vector<Integer *> &extracted)
-      {
-	Gmerge<vInteger,Integer>(y,extracted);
-      }
-      friend inline void extract(const vInteger &y,std::vector<Integer *> &extracted)
-      {
-	Gextract<vInteger,Integer>(y,extracted);
-      }
-      friend inline void merge(vInteger &y,std::vector<Integer> &extracted)
-      {
-	Gmerge<vInteger,Integer>(y,extracted);
-      }
-      friend inline void extract(const vInteger &y,std::vector<Integer> &extracted)
-      {
-	Gextract<vInteger,Integer>(y,extracted);
-      }
-      */
-
-    public:
-        static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);}
-  };
-
-    inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; }
-
-    inline void  zeroit(vInteger &z){ vzero(z);}
-
-    inline vInteger outerProduct(const vInteger &l, const vInteger& r)
-    {
-        return l*r;
-    }
-    
-}
-
-#endif
diff --git a/lib/simd/Old/Grid_vRealD.h b/lib/simd/Old/Grid_vRealD.h
deleted file mode 100644
index a32e579e..00000000
--- a/lib/simd/Old/Grid_vRealD.h
+++ /dev/null
@@ -1,276 +0,0 @@
-#ifndef GRID_VREALD_H
-#define GRID_VREALD_H
-
-namespace Grid {
-    class vRealD  {
-    public:
-        dvec v; // dvec is double precision vector
-
-    public:
-	typedef dvec  vector_type;
-	typedef RealD scalar_type;
-
-        vRealD()=default;
-        vRealD(RealD a){
-	  vsplat(*this,a);
-	};
-        vRealD(Zero &zero){
-	  zeroit(*this);
-	}
-        vRealD & operator = ( Zero & z){
-	  vzero(*this);
-	  return (*this);
-        }
-
-        friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
-        friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
-        friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
-        friend inline vRealD adj(const vRealD &in) { return in; }
-        friend inline vRealD conjugate(const vRealD &in){ return in; }
-        
-        friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
-#if defined (AVX1) || defined (SSE4)
-            y = a*x+y;
-#endif
-#ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
-            // accelerates multiply accumulate, but not general multiply add
-            y.v = _mm256_fmadd_pd(a.v,x.v,y.v);
-#endif
-#ifdef AVX512
-            // here precision of vector are still single
-            y.v = _mm512_fmadd_pd(a.v,x.v,y.v);
-#endif
-#ifdef QPX
-            y.v = vec_madd(a.v,x.v,y.v);
-#endif
-        }
-        //////////////////////////////////
-        // Initialise to 1,0
-        //////////////////////////////////
-        friend inline void vone (vRealD &ret){ vsplat(ret,1.0);}
-        friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);}
-        
-        
-        ////////////////////////////////////
-        // Arithmetic operator overloads +,-,*
-        ////////////////////////////////////
-        friend inline vRealD operator + (vRealD a, vRealD b)
-        {
-            vRealD ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_add_pd(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_add_pd(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_add_pd(a.v,b.v);
-#endif
-#ifdef QPX
-            ret.v = vec_add(a.v,b.v);
-#endif
-            return ret;
-        };
-        friend inline vRealD operator - (vRealD a, vRealD b)
-        {
-            vRealD ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_sub_pd(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_sub_pd(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_sub_pd(a.v,b.v);
-#endif
-#ifdef QPX
-            ret.v = vec_sub(a.v,b.v);
-#endif
-            return ret;
-        };
-        
-        friend inline vRealD operator * (vRealD a, vRealD b)
-        {
-            vRealD ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_mul_pd(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_mul_pd(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_mul_pd(a.v,b.v);
-#endif
-#ifdef QPX
-            ret.v = vec_mul(a.v,b.v);
-#endif
-            return ret;
-        };
-
-	////////////////////////////////////////////////////////////////////
-	// General permute; assumes vector length is same across 
-	// all subtypes; may not be a good assumption, but could
-	// add the vector width as a template param for BG/Q for example
-	////////////////////////////////////////////////////////////////////
-
-	friend inline void permute(vRealD &y,vRealD b,int perm)
-	{
-	  Gpermute<vRealD>(y,b,perm);
-	}
-	/*
-	friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
-	{
-	  Gmerge<vRealD,RealD >(y,extracted);
-	}
-	friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)
-	{
-	  Gextract<vRealD,RealD>(y,extracted);
-	}
-	friend inline void merge(vRealD &y,std::vector<RealD > &extracted)
-	{
-	  Gmerge<vRealD,RealD >(y,extracted);
-	}
-	friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)
-	{
-	  Gextract<vRealD,RealD>(y,extracted);
-	}
-	*/
-        
-        friend inline void vsplat(vRealD &ret,double a){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set_pd(a,a,a,a);
-#endif
-#ifdef SSE4
-            ret.v = _mm_set_pd(a,a);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_set1_pd(a);
-#endif
-#ifdef QPX
-            ret.v = {a,a,a,a};
-#endif
-        }
-	friend inline void vset(vRealD &ret, double *a){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
-#endif
-#ifdef SSE4
-            ret.v = _mm_set_pd(a[1],a[0]);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
-            // Note v has a0 a1 a2 a3 a4 a5 a6 a7
-#endif
-#ifdef QPX
-            ret.v = {a[0],a[1],a[2],a[3]};
-#endif
-	}
-
-	friend inline void vstore(const vRealD &ret, double *a){
-#if defined (AVX1)|| defined (AVX2)
-            _mm256_store_pd(a,ret.v);
-#endif
-#ifdef SSE4
-            _mm_store_pd(a,ret.v);
-#endif
-#ifdef AVX512
-            _mm512_store_pd(a,ret.v);
-            // Note v has a7 a6 a5ba4 a3 a2 a1 a0
-#endif
-#ifdef QPX
-	    assert(0);
-#endif
-	}
-        friend inline void vstream(vRealD &out,const vRealD &in){
-#if defined (AVX1)|| defined (AVX2)
-	  _mm256_stream_pd((double *)&out.v,in.v);
-#endif
-#ifdef SSE4
-	  _mm_stream_pd((double *)&out.v,in.v);
-#endif
-#ifdef AVX512
-	  _mm512_storenrngo_pd((double *)&out.v,in.v);
-	  //Note v has a3 a2 a1 a0
-#endif
-#ifdef QPX
-	  assert(0);
-#endif
-	}
-        friend inline void prefetch(const vRealD &v)
-        {
-            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
-        }
-        // Unary negation
-        friend inline vRealD operator -(const vRealD &r) {
-            vRealD ret;
-            vzero(ret);
-            ret = ret - r;
-            return ret;
-        }
-
-       friend inline RealD Reduce(const vRealD & in)
-       {
-	 vRealD v1,v2;
-	 union { 
-	   dvec v;
-	   double f[sizeof(dvec)/sizeof(double)];
-	 } conv;
-#ifdef SSE4
-	 permute(v1,in,0); // sse 128; paired real double
-	 v1=v1+in;
-#endif
-#if defined(AVX1) || defined (AVX2)
-	 permute(v1,in,0); // avx 256; quad double
-	 v1=v1+in;
-	 permute(v2,v1,1); 
-	 v1=v1+v2;
-#endif
-#ifdef AVX512
-	 permute(v1,in,0); // avx 512; octo-double
-	 v1=v1+in;
-	 permute(v2,v1,1); 
-	 v1=v1+v2;
-	 permute(v2,v1,2); 
-	 v1=v1+v2;
-#endif
-#ifdef QPX
-#endif
-	 conv.v=v1.v;
-	 return conv.f[0];
-       }
-
-        // *=,+=,-= operators
-        inline vRealD &operator *=(const vRealD &r) {
-            *this = (*this)*r;
-            return *this;
-        }
-        inline vRealD &operator +=(const vRealD &r) {
-            *this = *this+r;
-            return *this;
-        }
-        inline vRealD &operator -=(const vRealD &r) {
-            *this = *this-r;
-            return *this;
-        }
-
-    public:
-        static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
-    };
-
-    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; }
-    inline void zeroit(vRealD &z){ vzero(z);}
-
-    inline vRealD outerProduct(const vRealD &l, const vRealD& r)
-    {
-        return l*r;
-    }
-    inline vRealD trace(const vRealD &arg){
-        return arg;
-    }
-    inline vRealD real(const vRealD &arg){
-        return arg;
-    }
-
-
-}
-#endif
diff --git a/lib/simd/Old/Grid_vRealF.h b/lib/simd/Old/Grid_vRealF.h
deleted file mode 100644
index 05bfa2f5..00000000
--- a/lib/simd/Old/Grid_vRealF.h
+++ /dev/null
@@ -1,313 +0,0 @@
-#ifndef GRID_VREALF_H
-#define GRID_VREALF_H
-
-
-namespace Grid {
-    class vRealF  {
-    public:
-        fvec v;
-
-    public:
-	typedef fvec  vector_type;
-	typedef RealF scalar_type;
-
-        vRealF()=default;
-        vRealF(RealF a){
-	  vsplat(*this,a);
-	};
-        vRealF(Zero &zero){
-	  zeroit(*this);
-	}
-        vRealF & operator = ( Zero & z){
-	  vzero(*this);
-	  return (*this);
-        }
-        ////////////////////////////////////
-        // Arithmetic operator overloads +,-,*
-        ////////////////////////////////////
-        friend inline vRealF operator + ( vRealF a,  vRealF b)
-        {
-            vRealF ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_add_ps(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_add_ps(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_add_ps(a.v,b.v);
-#endif
-#ifdef QPX
-            vector4double aa,bb,cc;
-            aa = vec_lda(0,(float *)&a);
-            bb = vec_lda(0,(float *)&b);
-            cc = vec_add(aa,bb);
-            vec_sta(cc,0,(float *)&ret.v);
-#endif
-            return ret;
-        };
-        
-        friend inline vRealF operator - ( vRealF a, vRealF b)
-        {
-            vRealF ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_sub_ps(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_sub_ps(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_sub_ps(a.v,b.v);
-#endif
-#ifdef QPX
-            vector4double aa,bb,cc;
-            aa = vec_lda(0,(float *)&a);
-            bb = vec_lda(0,(float *)&b);
-            cc = vec_sub(aa,bb);
-            vec_sta(cc,0,(float *)&ret.v);
-#endif
-            return ret;
-        };
-
-        friend inline vRealF operator * ( vRealF a, vRealF b)
-        {
-            vRealF ret;
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_mul_ps(a.v,b.v);
-#endif
-#ifdef SSE4
-            ret.v = _mm_mul_ps(a.v,b.v);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_mul_ps(a.v,b.v);
-#endif
-#ifdef QPX
-            vector4double aa,bb,cc; // QPX single we are forced to load as this promotes single mem->double regs.
-            aa = vec_lda(0,(float *)&a);
-            bb = vec_lda(0,(float *)&b);
-            cc = vec_mul(aa,bb);
-            vec_sta(cc,0,(float *)&ret.v);
-#endif
-            return ret;
-        };
-        
-        ///////////////////////////////////////////////
-        // mult, sub, add, adj,conjugate, mac functions
-        ///////////////////////////////////////////////
-        friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
-        friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
-        friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
-        friend inline vRealF adj(const vRealF &in) { return in; }
-        friend inline vRealF conjugate(const vRealF &in){ return in; }
-
-        friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
-#if defined (AVX1) || defined (SSE4)
-            y = a*x+y;
-#endif
-#ifdef AVX2     // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
-            // accelerates multiply accumulate, but not general multiply add
-            y.v = _mm256_fmadd_ps(a.v,x.v,y.v);
-#endif
-#ifdef AVX512
-            y.v = _mm512_fmadd_ps(a.v,x.v,y.v);
-#endif
-#ifdef QPX
-            vector4double aa,xx,yy; // QPX single we are forced to load as this promotes single mem->double regs.
-            aa = vec_lda(0,(float *)&a.v);
-            xx = vec_lda(0,(float *)&x.v);
-            yy = vec_lda(0,(float *)&y.v);
-            yy = vec_madd(aa,xx,yy);
-            vec_sta(yy,0,(float *)&y.v);
-#endif
-        }
-        
-        //////////////////////////////////
-        // Initialise to 1,0,i
-        //////////////////////////////////
-        friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
-        friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
-
-
-	////////////////////////////////////////////////////////////////////
-	// General permute; assumes vector length is same across 
-	// all subtypes; may not be a good assumption, but could
-	// add the vector width as a template param for BG/Q for example
-	////////////////////////////////////////////////////////////////////
-
-	friend inline void permute(vRealF &y,vRealF b,int perm)
-	{
-	  Gpermute<vRealF>(y,b,perm);
-	}
-	/*
-	friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
-	{
-	  Gmerge<vRealF,RealF >(y,extracted);
-	}
-	friend inline void extract(const vRealF &y,std::vector<RealF *> &extracted)
-	{
-	  Gextract<vRealF,RealF>(y,extracted);
-	}
-	friend inline void merge(vRealF &y,std::vector<RealF> &extracted)
-	{
-	  Gmerge<vRealF,RealF >(y,extracted);
-	}
-	friend inline void extract(const vRealF &y,std::vector<RealF> &extracted)
-	{
-	  Gextract<vRealF,RealF>(y,extracted);
-	}
-	*/
-        
-        /////////////////////////////////////////////////////
-        // Broadcast a value across Nsimd copies.
-        /////////////////////////////////////////////////////
-        friend inline void vsplat(vRealF &ret,float a){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a);
-#endif
-#ifdef SSE4
-            ret.v = _mm_set_ps(a,a,a,a);
-#endif
-#ifdef AVX512
-            //ret.v = _mm512_set_ps(a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a);
-            ret.v = _mm512_set1_ps(a);
-#endif
-#ifdef QPX
-            ret.v = {a,a,a,a};
-#endif
-        }
-
-
-        friend inline void vset(vRealF &ret, float *a){
-#if defined (AVX1)|| defined (AVX2)
-            ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
-#endif
-#ifdef SSE4
-            ret.v = _mm_set_ps(a[3],a[2],a[1],a[0]);
-#endif
-#ifdef AVX512
-            ret.v = _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
-                                   a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
-            // Note v has a0 a1 a2 a3 a4 a5 a6 a7
-#endif
-#ifdef QPX
-            ret.v = {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
-#endif
-	}
-
-	////////////////////////////////////////////////////////////////////////
-	// FIXME:  gonna remove these load/store, get, set, prefetch
-	////////////////////////////////////////////////////////////////////////
-friend inline void vstore(const vRealF &ret, float *a){
-#if defined (AVX1)|| defined (AVX2)
-	_mm256_store_ps(a,ret.v);
-#endif
-#ifdef SSE4
-	_mm_store_ps(a,ret.v);
-#endif
-#ifdef AVX512
-	_mm512_store_ps(a,ret.v);
-	// Note v has a7 a6 a5ba4 a3 a2 a1 a0
-#endif
-#ifdef QPX
-	assert(0);
-#endif
-        }
-        friend inline void vstream(vRealF &out,const vRealF &in){
-#if defined (AVX1)|| defined (AVX2)
-	  _mm256_stream_ps((float *)&out.v,in.v);
-#endif
-#ifdef SSE4
-	  _mm_stream_ps((float *)&out.v,in.v);
-#endif
-#ifdef AVX512
-	  _mm512_storenrngo_ps((float *)&out.v,in.v);
-	  //	  _mm512_stream_ps((float *)&out.v,in.v);
-	  //Note v has a3 a2 a1 a0
-#endif
-#ifdef QPX
-	  assert(0);
-#endif
-	}
-
-
-        friend inline void prefetch(const vRealF &v)
-        {
-            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
-        }
-        // Unary negation
-        friend inline vRealF operator -(const vRealF &r) {
-            vRealF ret;
-            vzero(ret);
-            ret = ret - r;
-            return ret;
-        }
-       friend inline RealF Reduce(const vRealF & in)
-       {
-	 vRealF v1,v2;
-	 union { 
-	   fvec v;
-	   float f[sizeof(fvec)/sizeof(double)];
-	 } conv;
-#ifdef SSE4
-	 permute(v1,in,0); // sse 128; quad single
-	 v1=v1+in;
-	 permute(v2,v1,1); 
-	 v1=v1+v2;
-#endif
-#if defined(AVX1) || defined (AVX2)
-	 permute(v1,in,0); // avx 256; octo-double
-	 v1=v1+in;
-	 permute(v2,v1,1); 
-	 v1=v1+v2;
-	 permute(v2,v1,2); 
-	 v1=v1+v2;
-#endif
-#ifdef AVX512
-	 permute(v1,in,0); // avx 256; octo-double
-	 v1=v1+in;
-	 permute(v2,v1,1); 
-	 v1=v1+v2;
-	 permute(v2,v1,2); 
-	 v1=v1+v2;
-	 permute(v2,v1,3); 
-	 v1=v1+v2;
-#endif
-#ifdef QPX
-#endif
-	 conv.v=v1.v;
-	 return conv.f[0];
-        }
-
-        // *=,+=,-= operators
-        inline vRealF &operator *=(const vRealF &r) {
-            *this = (*this)*r;
-            return *this;
-        }
-        inline vRealF &operator +=(const vRealF &r) {
-            *this = *this+r;
-            return *this;
-        }
-        inline vRealF &operator -=(const vRealF &r) {
-            *this = *this-r;
-            return *this;
-        }
-    public:
-        static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
-    };
-    inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conjugate(l)*r; }
-    inline void  zeroit(vRealF &z){ vzero(z);}
-
-    inline vRealF outerProduct(const vRealF &l, const vRealF& r)
-    {
-        return l*r;
-    }
-    inline vRealF trace(const vRealF &arg){
-        return arg;
-    }
-    inline vRealF real(const vRealF &arg){
-        return arg;
-    }
-
-    
-}
-#endif
diff --git a/lib/tensors/Tensor_Ta.h b/lib/tensors/Tensor_Ta.h
index 3a903364..f06dbf04 100644
--- a/lib/tensors/Tensor_Ta.h
+++ b/lib/tensors/Tensor_Ta.h
@@ -29,15 +29,87 @@ namespace Grid {
     {
       iMatrix<vtype,N> ret(arg);
       double factor = (1/(double)N);
-      for(int c1=0;c1<N;c1++){
-	for(int c2=0;c2<N;c2++){
-	  ret._internal[c1][c2]= (ret._internal[c1][c2] - adj(arg._internal[c2][c1]));
-	  ret._internal[c1][c2] *= 0.5;
-	}}
-      //ret = (ret - adj(arg))*0.5;
+      ret = (ret - adj(arg))*0.5;
       ret -= trace(ret)*factor;
       return ret;
     }
 
+
+  /////////////////////////////////////////////// 
+  // ProjectOnGroup function for scalar, vector, matrix
+  /////////////////////////////////////////////// 
+
+
+  template<class vtype> inline iScalar<vtype> ProjectOnGroup(const iScalar<vtype>&r)
+    {
+      iScalar<vtype> ret;
+      ret._internal = ProjectOnGroup(r._internal);
+      return ret;
+    }
+  template<class vtype,int N> inline iVector<vtype,N> ProjectOnGroup(const iVector<vtype,N>&r)
+    {
+      iVector<vtype,N> ret;
+      for(int i=0;i<N;i++){
+        ret._internal[i] = ProjectOnGroup(r._internal[i]);
+      }
+      return ret;
+    }
+  template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
+    inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
+    {
+      // need a check for the group type?
+      iMatrix<vtype,N> ret(arg);
+      double nrm;
+      for(int c1=0;c1<N;c1++){
+	nrm = 0.0; 
+	for(int c2=0;c2<N;c2++)
+	  nrm = real(innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]));
+	nrm = 1.0/sqrt(nrm);
+	for(int c2=0;c2<N;c2++)
+	  ret._internal[c1][c2]*= nrm;
+      
+	for (int b=c1+1; b<N; ++b){
+	  decltype(ret._internal[b][b]*ret._internal[b][b]) pr = 0.0;
+	  for(int c=0; c<N; ++c)
+	    pr += ret._internal[c1][c]*ret._internal[b][c];
+	  
+	  for(int c=0; c<N; ++c){
+	    ret._internal[b][c] -= pr * ret._internal[c1][c];
+	  }
+	}
+	  
+      }
+      return ret;
+    }
+
+
+  /////////////////////////////////////////////// 
+  // Exponentiate function for scalar, vector, matrix
+  /////////////////////////////////////////////// 
+
+
+  template<class vtype> inline iScalar<vtype> Exponentiate(const iScalar<vtype>&r, double alpha,  int Nexp)
+    {
+      iScalar<vtype> ret;
+      ret._internal = Exponentiate(r._internal, alpha, Nexp);
+      return ret;
+    }
+
+
+  template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
+    inline iMatrix<vtype,N> Exponentiate(const iMatrix<vtype,N> &arg, double alpha, int Nexp)
+    {
+      iMatrix<vtype,N> unit(1.0);
+      iMatrix<vtype,N> temp(unit);
+      
+      for(int i=Nexp; i>=1;--i){
+	temp *= alpha/double(i);
+	temp = unit + temp*arg;
+      }
+      return ProjectOnGroup(temp);
+    }
+
+
+
 }
 #endif
diff --git a/tests/InvSqrt.gnu b/tests/InvSqrt.gnu
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/Sqrt.gnu b/tests/Sqrt.gnu
new file mode 100644
index 00000000..ae56ab97
--- /dev/null
+++ b/tests/Sqrt.gnu
@@ -0,0 +1,2 @@
+f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866));
+f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422));
diff --git a/tests/Test_main.cc b/tests/Test_main.cc
index cef2d848..88d69192 100644
--- a/tests/Test_main.cc
+++ b/tests/Test_main.cc
@@ -223,6 +223,9 @@ int main (int argc, char ** argv)
     //TComplex tracecm= trace(cm);      
     //std::cout << cm << "  "<< tracecm << std::endl;
 
+    cm = ProjectOnGroup(cm);
+
+    cm = Exponentiate(cm, 1.0, 10);
 
 //    Foo = Foo+scalar; // LatticeColourMatrix+Scalar
 //    Foo = Foo*scalar; // LatticeColourMatrix*Scalar

From 4d11fc03301df7da821e08ba50795879c6f68e4c Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:36:48 +0100
Subject: [PATCH 12/24] silly change

---
 tests/Test_nersc_io.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/Test_nersc_io.cc b/tests/Test_nersc_io.cc
index 403d2cab..f473781e 100644
--- a/tests/Test_nersc_io.cc
+++ b/tests/Test_nersc_io.cc
@@ -25,7 +25,6 @@ int main (int argc, char ** argv)
   std::vector<LatticeColourMatrix> U(4,&Fine);
   
   NerscField header;
-  
   std::string file("./ckpoint_lat.4000");
   readNerscConfiguration(Umu,header,file);
 

From 6403e3021ff976f186deec45073057fad3cd5438 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:37:21 +0100
Subject: [PATCH 13/24] Debugged finally. A silly mistake in permute cost me a
 day of debug.

---
 lib/algorithms/CoarsenedMatrix.h | 104 ++++++++++++++++++++++++-------
 1 file changed, 82 insertions(+), 22 deletions(-)

diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h
index 006da07f..ba417be3 100644
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -11,7 +11,6 @@ namespace Grid {
     int npoint;
     std::vector<int> directions   ;
     std::vector<int> displacements;
-    std::vector<int> opdirs;
 
     // FIXME -- don't like xposing the operator directions
     // as different to the geometrical dirs
@@ -26,18 +25,22 @@ namespace Grid {
       npoint = 2*_d+1;
       directions.resize(npoint);
       displacements.resize(npoint);
-      opdirs.resize(npoint);
       for(int d=0;d<_d;d++){
 	directions[2*d  ] = d+base;
 	directions[2*d+1] = d+base;
-	opdirs[2*d  ] = d;
-	opdirs[2*d+1] = d;
 	displacements[2*d  ] = +1;
 	displacements[2*d+1] = -1;
       }
       directions   [2*_d]=0;
       displacements[2*_d]=0;
-      opdirs       [2*_d]=0;
+      
+      //// report back
+      std::cout<<"directions    :";
+      for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
+      std::cout <<std::endl;
+      std::cout<<"displacements :";
+      for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
+      std::cout <<std::endl;
     }
   
     /*
@@ -64,12 +67,12 @@ namespace Grid {
   // Fine Object == (per site) type of fine field
   // nbasis      == number of deflation vectors
   template<class Fobj,class CComplex,int nbasis>
-  class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<vComplex,nbasis > > >  {
+  class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
   public:
     
-    typedef iVector<vComplex,nbasis > siteVector;
-    typedef Lattice<iVector<vComplex,nbasis > > CoarseVector;
-    typedef Lattice<iMatrix<vComplex,nbasis > > CoarseMatrix;
+    typedef iVector<CComplex,nbasis >             siteVector;
+    typedef Lattice<siteVector>                 CoarseVector;
+    typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
 
     typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
     typedef Lattice<Fobj >        FineField;
@@ -82,7 +85,6 @@ namespace Grid {
     CartesianStencil Stencil; 
 
     std::vector<CoarseMatrix> A;
-    std::vector<CoarseMatrix> Aslow;
 
     std::vector<siteVector,alignedAllocator<siteVector> >   comm_buf;
       
@@ -91,25 +93,28 @@ namespace Grid {
     ///////////////////////
     GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
 
-    RealD M    (const CoarseVector &in, CoarseVector &out){
-      
+    RealD M (const CoarseVector &in, CoarseVector &out){
+
+      conformable(_grid,in._grid);
+      conformable(in._grid,out._grid);
+
       SimpleCompressor<siteVector> compressor;
       Stencil.HaloExchange(in,comm_buf,compressor);
 
-PARALLEL_FOR_LOOP
+      //PARALLEL_FOR_LOOP
       for(int ss=0;ss<Grid()->oSites();ss++){
         siteVector res = zero;
-	siteVector tmp;
 	siteVector nbr;
+	int offset,local,perm,ptype;
 
-	int offset,local,perm;
 	for(int point=0;point<geom.npoint;point++){
 	  offset = Stencil._offsets [point][ss];
 	  local  = Stencil._is_local[point][ss];
-	  perm   = Stencil._permute[point][ss];
+	  perm   = Stencil._permute [point][ss];
+	  ptype  = Stencil._permute_type[point];
 	  
 	  if(local&&perm) { 
-	    permute(nbr,in._odata[offset],perm);
+	    permute(nbr,in._odata[offset],ptype);
 	  } else if(local) { 
 	    nbr = in._odata[offset];
 	  } else {
@@ -125,6 +130,7 @@ PARALLEL_FOR_LOOP
     RealD Mdag (const CoarseVector &in, CoarseVector &out){ 
       return M(in,out);
     };
+
     // Defer support for further coarsening for now
     void Mdiag    (const CoarseVector &in,  CoarseVector &out){};
     void Mdir     (const CoarseVector &in,  CoarseVector &out,int dir, int disp){};
@@ -134,8 +140,7 @@ PARALLEL_FOR_LOOP
       _grid(&CoarseGrid),
       geom(CoarseGrid._ndimension),
       Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
-	A(geom.npoint,&CoarseGrid),
-	Aslow(geom.npoint,&CoarseGrid)
+      A(geom.npoint,&CoarseGrid)
     {
       comm_buf.resize(Stencil._unified_buffer_size);
     };
@@ -176,8 +181,7 @@ PARALLEL_FOR_LOOP
 	for(int p=0;p<geom.npoint;p++){ 
 
 	  int dir   = geom.directions[p];
-	  int opdir = geom.opdirs[p];
-	  int disp= geom.displacements[p];
+	  int disp  = geom.displacements[p];
 
 	  int block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
 
@@ -187,7 +191,7 @@ PARALLEL_FOR_LOOP
 	    linop.OpDiag(phi,Mphi);
 	  }
 	  else  {
-	    linop.OpDir(phi,Mphi,opdir,disp); 
+	    linop.OpDir(phi,Mphi,dir,disp); 
 	  }
 
 	  ////////////////////////////////////////////////////////////////////////
@@ -240,6 +244,62 @@ PARALLEL_FOR_LOOP
       std::cout<< iProj <<std::endl;
       std::cout<<"Computed Coarse Operator"<<std::endl;
 #endif
+      AssertHermitian();
+      //      ForceHermitian();
+      //      ForceDiagonal();
+    }
+    void ForceDiagonal(void) {
+
+
+      std::cout<<"**************************************************"<<std::endl;
+      std::cout<<"****   Forcing coarse operator to be diagonal ****"<<std::endl;
+      std::cout<<"**************************************************"<<std::endl;
+      for(int p=0;p<8;p++){
+	A[p]=zero;
+      }
+
+      GridParallelRNG  RNG(Grid()); RNG.SeedRandomDevice();
+      Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
+
+      Complex one(1.0);
+
+      iMatrix<Complex,nbasis> ident;  ident=one;
+
+      val = val*adj(val);
+      val = val + 1.0;
+
+      A[8] = val*ident;
+
+      //      for(int s=0;s<Grid()->oSites();s++) {
+      //	A[8]._odata[s]=val._odata[s];
+      //      }
+    }
+    void ForceHermitian(void) {
+      for(int d=0;d<4;d++){
+	int dd=d+1;
+	A[2*d] = adj(Cshift(A[2*d+1],dd,1));
+      }
+      A[8] = 0.5*(A[8] + adj(A[8]));
+    }
+    void AssertHermitian(void) {
+      CoarseMatrix AA    (Grid());
+      CoarseMatrix AAc   (Grid());
+      CoarseMatrix Diff  (Grid());
+      for(int d=0;d<4;d++){
+	
+	int dd=d+1;
+	AAc = Cshift(A[2*d+1],dd,1);
+	AA  = A[2*d];
+	
+	Diff = AA - adj(AAc);
+
+	std::cout<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
+	std::cout<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
+	  
+      }
+      Diff = A[8] - adj(A[8]);
+      std::cout<<"Norm diff local "<< norm2(Diff)<<std::endl;
+      std::cout<<"Norm local "<< norm2(A[8])<<std::endl;
     }
     
   };

From 25bfff370af27bdc35004cd1a37f989bf1a5e0a1 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:38:13 +0100
Subject: [PATCH 14/24] Starting to use

---
 lib/algorithms/approx/Chebyshev.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/lib/algorithms/approx/Chebyshev.h b/lib/algorithms/approx/Chebyshev.h
index 5c7741e4..0dad7179 100644
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -42,6 +42,14 @@ namespace Grid {
     double lo;
 
   public:
+    void csv(std::ostream &out){
+      for (double x=lo; x<hi; x+=(hi-lo)/1000) {
+	double f = approx(x);
+	out<< x<<" "<<f<<std::endl;
+      }
+      return;
+    }
+
     Chebyshev(double _lo,double _hi,int _order, double (* func)(double) ){
       lo=_lo;
       hi=_hi;
@@ -52,16 +60,16 @@ namespace Grid {
       for(int j=0;j<order;j++){
 	double s=0;
 	for(int k=0;k<order;k++){
-	  double y=cos(M_PI*(k+0.5)/order);
+	  double y=std::cos(M_PI*(k+0.5)/order);
 	  double x=0.5*(y*(hi-lo)+(hi+lo));
 	  double f=func(x);
-	  s=s+f*cos( j*M_PI*(k+0.5)/order );
+	  s=s+f*std::cos( j*M_PI*(k+0.5)/order );
 	}
 	Coeffs[j] = s * 2.0/order;
       }
     };
 
-    double Evaluate(double x) // Convenience for plotting the approximation
+    double approx(double x) // Convenience for plotting the approximation
     {
       double Tn;
       double Tnm;
@@ -91,7 +99,7 @@ namespace Grid {
     void   PlotApprox(std::ostream &out) {
       out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
       for(double x=lo;x<hi;x+=(hi-lo)/50.0){
-	out <<x<<"\t"<<Evaluate(x)<<std::endl;
+	out <<x<<"\t"<<approx(x)<<std::endl;
       }
     };
 

From 1f2498918d220910737c58b713d87c37fe53727f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:39:13 +0100
Subject: [PATCH 15/24] Got this sorted with the promote working in a test

---
 lib/lattice/Lattice_transfer.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index 12993680..fb6b39f1 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -94,7 +94,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
     for(int i=0;i<nbasis;i++) {
       
       coarseData._odata[sc](i)=coarseData._odata[sc](i)
-	+ TensorRemove( innerProduct(Basis[i]._odata[sf],fineData._odata[sf]));
+	+ innerProduct(Basis[i]._odata[sf],fineData._odata[sf]);
 
     }
   }
@@ -266,7 +266,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
   assert( nbasis == Basis.size() );
   subdivides(coarse,fine); 
   for(int i=0;i<nbasis;i++){
-    conformable(Basis,fineData);
+    conformable(Basis[i]._grid,fine);
   }
 
   std::vector<int>  block_r      (_ndimension);
@@ -287,9 +287,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
     GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 
     for(int i=0;i<nbasis;i++) {
-
-      if(i==0) fineData._odata[sf]=                    coarseData._odata[sc][i]*Basis[i]._odata[sf];
-      else     fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc][i]*Basis[i]._odata[sf];
+      if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
+      else     fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
 
     }
   }

From a57ca0bbe7e1d7b119afdc2690a96682e81de16b Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:39:37 +0100
Subject: [PATCH 16/24] Prettier reporting

---
 scripts/linecount | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/scripts/linecount b/scripts/linecount
index 0be67142..4463ce84 100755
--- a/scripts/linecount
+++ b/scripts/linecount
@@ -1,12 +1,30 @@
 #!/bin/sh 
+if [ $# -eq 1 ]
+then
+  wc -l lib/*.h lib/*/*.h lib/*/*/*.h lib/*.cc lib/*/*.cc lib/*/*/*.cc tests/*.cc benchmarks/*.cc lib/*/*/*/*.cc lib/*/*/*/*.h >& tmp.fil
+  count=`grep total tmp.fil`
+  echo $count " in Grid library"
+else 
+  wc -l lib/*.h lib/*/*.h lib/*/*/*.h lib/*.cc lib/*/*.cc lib/*/*/*.cc tests/*.cc benchmarks/*.cc lib/*/*/*/*.cc lib/*/*/*/*.h 
+fi
 
-wc -l lib/*.h lib/*/*.h lib/*/*/*.h lib/*.cc lib/*/*.cc lib/*/*/*.cc tests/*.cc benchmarks/*.cc lib/*/*/*/*.cc lib/*/*/*/*.h
+wc -l lib/*.h lib/*/*.h lib/*/*/*.h lib/*.cc lib/*/*.cc lib/*/*/*.cc lib/*/*/*/*.cc lib/*/*/*/*.h >& tmp.fil
+count=`grep total tmp.fil`
+echo  $count " in lib"
 
-echo "lib"
-wc -l lib/*.h lib/*/*.h lib/*/*/*.h lib/*.cc lib/*/*.cc lib/*/*/*.cc lib/*/*/*/*.cc lib/*/*/*/*.h | grep total
+for sdir in `ls -F lib/| grep /`
+do
+wc -l lib/${sdir}/*.h lib/${sdir}/*/*.h lib/${sdir}/*.cc lib/${sdir}/*/*.cc lib/${sdir}/*/*/*.cc lib/${sdir}/*/*/*.h >& tmp.fil
+count=`grep total tmp.fil`
+echo $count " in lib/${sdir}" 
+done
 
-echo "tests"
-wc -l tests/*.cc | grep total
+wc -l tests/*.cc | grep total >& tmp.fil
+count=`grep total tmp.fil`
+echo  $count " in tests"
 
-echo "benchmarks"
-wc -l benchmarks/*.cc | grep total
+wc -l benchmarks/*.cc | grep total >& tmp.fil
+count=`grep total tmp.fil`
+echo  $count " in benchmarks"
+
+rm tmp.fil

From 61770d447242dee8976c018631bc09721327bd2a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:40:12 +0100
Subject: [PATCH 17/24] Files update

---
 tests/Make.inc | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/Make.inc b/tests/Make.inc
index 2b132fd0..64ba4768 100644
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
+bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cg Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
 
 
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
@@ -18,6 +18,14 @@ Test_cayley_even_odd_SOURCES=Test_cayley_even_odd.cc
 Test_cayley_even_odd_LDADD=-lGrid
 
 
+Test_cayley_ldop_cg_SOURCES=Test_cayley_ldop_cg.cc
+Test_cayley_ldop_cg_LDADD=-lGrid
+
+
+Test_cayley_ldop_cr_SOURCES=Test_cayley_ldop_cr.cc
+Test_cayley_ldop_cr_LDADD=-lGrid
+
+
 Test_cf_coarsen_support_SOURCES=Test_cf_coarsen_support.cc
 Test_cf_coarsen_support_LDADD=-lGrid
 

From 708d4f7533c2c89cd7358bae073d672622e112ca Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:40:58 +0100
Subject: [PATCH 18/24] g5 and g5R5 hermitian are now differentiated

---
 lib/qcd/action/fermion/g5HermitianLinop.h | 47 ++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/lib/qcd/action/fermion/g5HermitianLinop.h b/lib/qcd/action/fermion/g5HermitianLinop.h
index 5b3411ec..e5b691d8 100644
--- a/lib/qcd/action/fermion/g5HermitianLinop.h
+++ b/lib/qcd/action/fermion/g5HermitianLinop.h
@@ -6,10 +6,10 @@ namespace Grid {
 // Wrap an already herm matrix
 ////////////////////////////////////////////////////////////////////
 template<class Matrix,class Field>
-class Gamma5HermitianLinearOperator : public LinearOperatorBase<Field> {
+class Gamma5R5HermitianLinearOperator : public LinearOperatorBase<Field> {
   Matrix &_Mat;
 public:
-  Gamma5HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
+  Gamma5R5HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
   void Op     (const Field &in, Field &out){
     HermOp(in,out);
   }
@@ -45,5 +45,48 @@ public:
   }
 };
 
+
+template<class Matrix,class Field>
+class Gamma5HermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+  Gamma g5;
+public:
+    Gamma5HermitianLinearOperator(Matrix &Mat): _Mat(Mat), g5(Gamma::Gamma5) {};
+  void Op     (const Field &in, Field &out){
+    HermOp(in,out);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    HermOp(in,out);
+  }
+  void OpDiag (const Field &in, Field &out) {
+    Field tmp(in._grid);
+    _Mat.Mdiag(in,tmp);
+    out=g5*tmp;
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    Field tmp(in._grid);
+    _Mat.Mdir(in,tmp,dir,disp);
+    out=g5*tmp;
+  }
+
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+
+    HermOp(in,out);
+    
+    ComplexD dot;
+    dot= innerProduct(in,out);
+    n1=real(dot);
+    
+    dot = innerProduct(out,out);
+    n2=real(dot);
+  }
+  void HermOp(const Field &in, Field &out){
+    Field tmp(in._grid);
+    _Mat.M(in,tmp);
+    out=g5*tmp;
+  }
+};
+
+
 }}
 #endif

From 3a52bc4ce1a79fb75869e54a345c20748fccdc53 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:41:27 +0100
Subject: [PATCH 19/24] G5R5 update

---
 tests/Test_dwf_cr_unprec.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/Test_dwf_cr_unprec.cc b/tests/Test_dwf_cr_unprec.cc
index cffa726c..6cd2b32a 100644
--- a/tests/Test_dwf_cr_unprec.cc
+++ b/tests/Test_dwf_cr_unprec.cc
@@ -55,7 +55,7 @@ int main (int argc, char ** argv)
   MdagMLinearOperator<DomainWallFermion,LatticeFermion> HermOp(Ddwf);
   MCR(HermOp,src,result);
 
-  Gamma5HermitianLinearOperator<DomainWallFermion,LatticeFermion> g5HermOp(Ddwf);
+  Gamma5R5HermitianLinearOperator<DomainWallFermion,LatticeFermion> g5HermOp(Ddwf);
   MCR(g5HermOp,src,result);
 
 

From 4963f7356ac0973171c52fbe5bcb503fe77c1062 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:41:59 +0100
Subject: [PATCH 20/24] 5d OpDir direction interface refers to the 5d dims, not
 4d to present a sensible and consistent external interface.

---
 lib/qcd/action/fermion/WilsonFermion5D.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 7d9ffe22..7bbeeef9 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -72,6 +72,7 @@ namespace QCD {
 }
 void WilsonFermion5D::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
 {
+  assert(GaugeGrid()->_ndimension==4);
   conformable(Uds._grid,GaugeGrid());
   conformable(Umu._grid,GaugeGrid());
   LatticeColourMatrix U(GaugeGrid());
@@ -82,8 +83,10 @@ void WilsonFermion5D::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGau
     pokeIndex<LorentzIndex>(Uds,U,mu+4);
   }
 }
-void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir,int disp)
+void WilsonFermion5D::DhopDir(const LatticeFermion &in, LatticeFermion &out,int dir5,int disp)
 {
+  int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
+                    // we drop off the innermost fifth dimension
   assert( (disp==1)||(disp==-1) );
   assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
 

From 4357f5acdc6038f9b157271eea0c536496053c93 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:43:10 +0100
Subject: [PATCH 21/24] Some extra tests

---
 tests/Test_cayley_coarsen_support.cc | 99 ++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 19 deletions(-)

diff --git a/tests/Test_cayley_coarsen_support.cc b/tests/Test_cayley_coarsen_support.cc
index 064f5f13..b68a5cc3 100644
--- a/tests/Test_cayley_coarsen_support.cc
+++ b/tests/Test_cayley_coarsen_support.cc
@@ -21,9 +21,7 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
-
-
-  const int Ls=4;
+  const int Ls=8;
 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -41,8 +39,10 @@ int main (int argc, char ** argv)
 
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
-  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
 
   LatticeFermion    src(FGrid); random(RNG5,src);
   LatticeFermion result(FGrid); result=zero;
@@ -50,31 +50,33 @@ int main (int argc, char ** argv)
   LatticeFermion    tmp(FGrid);
   LatticeFermion    err(FGrid);
   LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+
 #if 0
   std::vector<LatticeColourMatrix> U(4,UGrid);
   Umu=zero;
   Complex cone(1.0,0.0);
   for(int nn=0;nn<Nd;nn++){
     if(1) {
-      if (nn!=0) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }
-      else       { U[nn] = cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }
+      if (nn>2) { U[nn]=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl; }
+      else      { U[nn]=cone; std::cout << "unit gauge field in dir "<<nn<<std::endl; }
     }
     pokeIndex<LorentzIndex>(Umu,U[nn],nn);
   }
 #endif  
+
   RealD mass=0.5;
   RealD M5=1.8;
 
   DomainWallFermion Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  Gamma5HermitianLinearOperator<DomainWallFermion,LatticeFermion> HermIndefOp(Ddwf);
+  Gamma5R5HermitianLinearOperator<DomainWallFermion,LatticeFermion> HermIndefOp(Ddwf);
 
   HermIndefOp.Op(src,ref);
   HermIndefOp.OpDiag(src,result);
   
   for(int d=0;d<4;d++){
-    HermIndefOp.OpDir(src,tmp,d,+1); result=result+tmp; 
+    HermIndefOp.OpDir(src,tmp,d+1,+1); result=result+tmp; 
     std::cout<<"dir "<<d<<" tmp "<<norm2(tmp)<<std::endl;
-    HermIndefOp.OpDir(src,tmp,d,-1); result=result+tmp;
+    HermIndefOp.OpDir(src,tmp,d+1,-1); result=result+tmp;
     std::cout<<"dir "<<d<<" tmp "<<norm2(tmp)<<std::endl;
   }
   err = result-ref;
@@ -82,32 +84,91 @@ int main (int argc, char ** argv)
 
   const int nbasis = 2;
   std::vector<LatticeFermion> subspace(nbasis,FGrid);
-  
+  LatticeFermion prom(FGrid);
+
   for(int b=0;b<nbasis;b++){
     random(RNG5,subspace[b]);
   }
   std::cout << "Computed randoms"<< std::endl;
 
-  CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOp(*Coarse5d);
+  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  LittleDiracOperator LittleDiracOp(*Coarse5d);
 
   LittleDiracOp.CoarsenOperator(FGrid,HermIndefOp,subspace);
-
-  typedef   Lattice<iVector<vComplex,nbasis > > coarse_vec;
   
-  coarse_vec c_src (Coarse5d);  c_src= zero;
-  coarse_vec c_res (Coarse5d);
+  CoarseVector c_src (Coarse5d);
+  CoarseVector c_res (Coarse5d);
+  CoarseVector c_proj(Coarse5d);
   
-  Complex one(1.0);
-  c_src = one;  // 1 in every element for vector 1.
-
   //  TODO
   // -- promote from subspace, check we get the vector we wanted
   // -- apply ldop; check we get the same as inner product of M times big vec
   // -- pick blocks one by one. Evaluate matrix elements.
+  Complex one(1.0);
+  c_src = one;  // 1 in every element for vector 1.
+  
+  blockPromote(c_src,err,subspace);
+
+
+  prom=zero;
+  for(int b=0;b<nbasis;b++){
+    prom=prom+subspace[b];
+  }
+  err=err-prom; 
+  std::cout<<"Promoted back from subspace err "<<norm2(err)<<std::endl;
+
+  HermIndefOp.HermOp(prom,tmp);
+  blockProject(c_proj,tmp,subspace);
+
+  LittleDiracOp.M(c_src,c_res);
+
+  c_proj = c_proj - c_res;
+  std::cout<<"Representation of ldop within subspace "<<norm2(c_proj)<<std::endl;
 
   std::cout << "Multiplying by LittleDiracOp "<< std::endl;
   LittleDiracOp.M(c_src,c_res);
 
+  std::cout<<"Testing hermiticity explicitly by inspecting matrix elements"<<std::endl;
+  LittleDiracOp.AssertHermitian();
+
+  std::cout << "Testing Hermiticity stochastically "<< std::endl;
+  CoarseVector phi(Coarse5d);
+  CoarseVector chi(Coarse5d);
+  CoarseVector Aphi(Coarse5d);
+  CoarseVector Achi(Coarse5d);
+
+  random(CRNG,phi);
+  random(CRNG,chi);
+
+
+  std::cout<<"Made randoms"<<std::endl;
+
+  LittleDiracOp.M(phi,Aphi);
+  LittleDiracOp.Mdag(chi,Achi);
+
+  ComplexD pAc = innerProduct(chi,Aphi);
+  ComplexD cAp = innerProduct(phi,Achi);
+  ComplexD cAc = innerProduct(chi,Achi);
+  ComplexD pAp = innerProduct(phi,Aphi);
+
+  std::cout<< "pAc "<<pAc<<" cAp "<< cAp<< " diff "<<pAc-adj(cAp)<<std::endl;
+
+  std::cout<< "pAp "<<pAp<<" cAc "<< cAc<<"Should be real"<< std::endl;
+
+  std::cout<<"Testing linearity"<<std::endl;
+  CoarseVector PhiPlusChi(Coarse5d);
+  CoarseVector APhiPlusChi(Coarse5d);
+  CoarseVector linerr(Coarse5d);
+  PhiPlusChi = phi+chi;
+  LittleDiracOp.M(PhiPlusChi,APhiPlusChi);
+
+  linerr= APhiPlusChi-Aphi;
+  linerr= linerr-Achi;
+  std::cout<<"**Diff "<<norm2(linerr)<<std::endl;
+
+
   std::cout << "Done "<< std::endl;
   Grid_finalize();
 }

From 2022a7205c585e8187b306442c2459a3c004f696 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:43:41 +0100
Subject: [PATCH 22/24] Remove extra layers of checks now it works

---
 lib/algorithms/CoarsenedMatrix.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h
index ba417be3..96a40fe7 100644
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -244,7 +244,7 @@ namespace Grid {
       std::cout<< iProj <<std::endl;
       std::cout<<"Computed Coarse Operator"<<std::endl;
 #endif
-      AssertHermitian();
+      //      AssertHermitian();
       //      ForceHermitian();
       //      ForceDiagonal();
     }

From c4f204f440d097b14b27890df08096bb2f13fba6 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:50:45 +0100
Subject: [PATCH 23/24] Solver converges

---
 tests/Test_cayley_ldop_cg.cc | 136 +++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 tests/Test_cayley_ldop_cg.cc

diff --git a/tests/Test_cayley_ldop_cg.cc b/tests/Test_cayley_ldop_cg.cc
new file mode 100644
index 00000000..b146002d
--- /dev/null
+++ b/tests/Test_cayley_ldop_cg.cc
@@ -0,0 +1,136 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+double lowpass(double x)
+{
+  return pow(x*x+1.0,-2);
+}
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Chebyshev<LatticeFermion> filter(-150.0, 150.0,16, lowpass);
+  ofstream csv(std::string("filter.dat"),std::ios::out|std::ios::trunc);
+  filter.csv(csv);
+  csv.close();
+
+  const int Ls=8;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid
+  std::vector<int> clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  LatticeFermion    src(FGrid); gaussian(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid); ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid); gaussian(RNG4,Umu);
+
+
+  //random(RNG4,Umu);
+  NerscField header;
+  std::string file("./ckpoint_lat.400");
+  readNerscConfiguration(Umu,header,file);
+
+#if 0
+  LatticeColourMatrix U(UGrid);
+  U=zero;
+  for(int nn=0;nn<Nd;nn++){
+    if (nn>2) {
+      pokeIndex<LorentzIndex>(Umu,U,nn);
+    }
+  }
+#endif  
+  RealD mass=0.1;
+  RealD M5=1.0;
+
+  DomainWallFermion Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  Gamma5R5HermitianLinearOperator<DomainWallFermion,LatticeFermion> HermIndefOp(Ddwf);
+
+  const int nbasis = 8;
+  std::vector<LatticeFermion> subspace(nbasis,FGrid);
+  LatticeFermion noise(FGrid);
+  LatticeFermion ms(FGrid);
+  for(int b=0;b<nbasis;b++){
+    Gamma g5(Gamma::Gamma5);
+    gaussian(RNG5,noise);
+    RealD scale = pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    HermIndefOp.HermOp(noise,ms); std::cout << "Noise    "<<b<<" Ms "<<norm2(ms)<< " "<< norm2(noise)<<std::endl;
+
+
+    //    filter(HermIndefOp,noise,subspace[b]);
+    // inverse iteration
+    MdagMLinearOperator<DomainWallFermion,LatticeFermion> HermDefOp(Ddwf);
+    ConjugateGradient<LatticeFermion> CG(1.0e-5,10000);
+
+    for(int i=0;i<4;i++){
+
+      CG(HermDefOp,noise,subspace[b]);
+      noise = subspace[b];
+
+      scale = pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+      HermDefOp.HermOp(noise,ms); std::cout << "filt    "<<b<<" <u|H|u> "<<norm2(ms)<< " "<< norm2(noise)<<std::endl;
+    }
+
+    subspace[b] = noise;
+    HermIndefOp.HermOp(subspace[b],ms); std::cout << "Filtered "<<b<<" Ms "<<norm2(ms)<< " "<<norm2(subspace[b]) <<std::endl;
+  }
+  std::cout << "Computed randoms"<< std::endl;
+
+  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  LittleDiracOperator LittleDiracOp(*Coarse5d);
+  LittleDiracOp.CoarsenOperator(FGrid,HermIndefOp,subspace);
+  
+  CoarseVector c_src (Coarse5d);
+  CoarseVector c_res (Coarse5d);
+  gaussian(CRNG,c_src);
+  c_res=zero;
+
+  std::cout << "Solving CG on coarse space "<< std::endl;
+  MdagMLinearOperator<LittleDiracOperator,CoarseVector> PosdefLdop(LittleDiracOp);
+  ConjugateGradient<CoarseVector> CG(1.0e-8,10000);
+  CG(PosdefLdop,c_src,c_res);
+
+  std::cout << "Done "<< std::endl;
+  Grid_finalize();
+}

From 5a53086e41d959140460cdc96cfca575547b3918 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 9 Jun 2015 22:51:02 +0100
Subject: [PATCH 24/24] Solver converges

---
 tests/Test_cayley_ldop_cr.cc | 138 +++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 tests/Test_cayley_ldop_cr.cc

diff --git a/tests/Test_cayley_ldop_cr.cc b/tests/Test_cayley_ldop_cr.cc
new file mode 100644
index 00000000..afa773b2
--- /dev/null
+++ b/tests/Test_cayley_ldop_cr.cc
@@ -0,0 +1,138 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+double lowpass(double x)
+{
+  return pow(x*x+1.0,-2);
+}
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Chebyshev<LatticeFermion> filter(-150.0, 150.0,16, lowpass);
+  ofstream csv(std::string("filter.dat"),std::ios::out|std::ios::trunc);
+  filter.csv(csv);
+  csv.close();
+
+  const int Ls=8;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid
+  std::vector<int> clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  LatticeFermion    src(FGrid); gaussian(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid); ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid); gaussian(RNG4,Umu);
+
+
+  //random(RNG4,Umu);
+  NerscField header;
+  std::string file("./ckpoint_lat.400");
+  readNerscConfiguration(Umu,header,file);
+
+#if 0
+  LatticeColourMatrix U(UGrid);
+  Complex cone(1.0,0.0);
+  for(int nn=0;nn<Nd;nn++){
+    if (nn==3) { 
+      U=zero; std::cout << "zeroing gauge field in dir "<<nn<<std::endl;
+    //    else       { U[nn]= cone;std::cout << "unit gauge field in dir "<<nn<<std::endl; }
+      pokeIndex<LorentzIndex>(Umu,U,nn);
+    }
+  }
+#endif  
+  RealD mass=0.5;
+  RealD M5=1.8;
+
+  DomainWallFermion Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  Gamma5R5HermitianLinearOperator<DomainWallFermion,LatticeFermion> HermIndefOp(Ddwf);
+
+  const int nbasis = 8;
+  std::vector<LatticeFermion> subspace(nbasis,FGrid);
+  LatticeFermion noise(FGrid);
+  LatticeFermion ms(FGrid);
+  for(int b=0;b<nbasis;b++){
+    Gamma g5(Gamma::Gamma5);
+    gaussian(RNG5,noise);
+    RealD scale = pow(norm2(noise),-0.5); 
+    noise=noise*scale;
+
+    HermIndefOp.HermOp(noise,ms); std::cout << "Noise    "<<b<<" Ms "<<norm2(ms)<< " "<< norm2(noise)<<std::endl;
+
+
+    //    filter(HermIndefOp,noise,subspace[b]);
+    // inverse iteration
+    MdagMLinearOperator<DomainWallFermion,LatticeFermion> HermDefOp(Ddwf);
+    ConjugateGradient<LatticeFermion> CG(1.0e-5,10000);
+
+    for(int i=0;i<4;i++){
+
+      CG(HermDefOp,noise,subspace[b]);
+      noise = subspace[b];
+
+      scale = pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+      HermDefOp.HermOp(noise,ms); std::cout << "filt    "<<b<<" <u|H|u> "<<norm2(ms)<< " "<< norm2(noise)<<std::endl;
+    }
+
+    subspace[b] = noise;
+    HermIndefOp.HermOp(subspace[b],ms); std::cout << "Filtered "<<b<<" Ms "<<norm2(ms)<< " "<<norm2(subspace[b]) <<std::endl;
+  }
+  std::cout << "Computed randoms"<< std::endl;
+
+  typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  LittleDiracOperator LittleDiracOp(*Coarse5d);
+  LittleDiracOp.CoarsenOperator(FGrid,HermIndefOp,subspace);
+  
+  CoarseVector c_src (Coarse5d);
+  CoarseVector c_res (Coarse5d);
+  gaussian(CRNG,c_src);
+  c_res=zero;
+
+  std::cout << "Solving MCR on coarse space "<< std::endl;
+  HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermIndefLdop(LittleDiracOp);
+  ConjugateResidual<CoarseVector> MCR(1.0e-8,10000);
+  MCR(HermIndefLdop,c_src,c_res);
+
+  std::cout << "Done "<< std::endl;
+  Grid_finalize();
+}