Merge branch 'master' of github.com:paboyle/Grid

Conflicts: lib/simd/Grid_avx512.h lib/simd/Grid_imci.h
2026-02-04 14:23:29 +00:00 · 2015-11-04 03:32:10 -08:00
parent 6be9716e6f dfc1de6f60
commit 16c7993434
33 changed files with 2939 additions and 339 deletions
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -24,7 +24,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=8;
+  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -82,22 +82,24 @@ int main (int argc, char ** argv)
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=10;
-  double t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Dw.Dhop(src,result,0);
-  }
-  double t1=usecond();
+  int ncall=100;
+  {
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+    double t1=usecond();
+    
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;

-  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume*ncall;
-  
-  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
-  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-  err = ref-result; 
-  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    std::cout<<GridLogMessage << "Called Dw"<<std::endl;
+    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    err = ref-result; 
+    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  }


  if (1)
@@ -140,6 +142,18 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;

+  {
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.DhopEO(src_o,r_e,DaggerNo);
+    }
+    double t1=usecond();
+    
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(1344.0*volume*ncall)/2;
+
+    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+  }

  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -90,7 +90,7 @@ int main (int argc, char ** argv)
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=10000;
+  int ncall=1000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.Dhop(src,result,0);
--- a/13
+++ b/13
@@ -1384,7 +1384,7 @@ Optional Features:
  --disable-dependency-tracking
                          speeds up one-time build
  --disable-openmp        do not use OpenMP
-  --enable-simd=SSE4|AVX|AVX2|AVX512|IMCI
+  --enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI
                          Select instructions to be SSE4.0, AVX 1.0, AVX
                          2.0+FMA, AVX 512, IMCI
  --enable-precision=single|double
@@ -6403,6 +6403,17 @@ $as_echo "#define AVX1 1" >>confdefs.h
 $as_echo "$as_me: WARNING: Your processor does not support AVX instructions" >&2;}
       fi
     ;;
+     AVXFMA4)
+       echo Configuring for AVX
+
+$as_echo "#define AVXFMA4 1" >>confdefs.h
+
+       if test x"$ax_cv_support_avx_ext" = x"yes"; then         supported=yes
+       else
+       	{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Your processor does not support AVX instructions" >&5
+$as_echo "$as_me: WARNING: Your processor does not support AVX instructions" >&2;}
+       fi
+     ;;
     AVX2)
       echo Configuring for AVX2

--- a/configure.ac
+++ b/configure.ac
@@ -65,7 +65,7 @@ AC_CHECK_FUNCS([gettimeofday])
 #Please install or provide the correct path to your installation
 #Info at: http://www.mpfr.org/)])

-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|IMCI],\
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
 	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
 	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])

@@ -90,6 +90,15 @@ case ${ac_SIMD} in
       	AC_MSG_WARN([Your processor does not support AVX instructions])
       fi
     ;;
+     AVXFMA4)
+       echo Configuring for AVX
+       AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
+       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
+       supported=yes			  
+       else
+       	AC_MSG_WARN([Your processor does not support AVX instructions])
+       fi
+     ;;
     AVX2)
       echo Configuring for AVX2
       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -18,8 +18,8 @@
 #include <algorithms/iterative/ConjugateGradientMultiShift.h>

 // Lanczos support
-//#include <algorithms/iterative/MatrixUtils.h>
-//#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <algorithms/iterative/MatrixUtils.h>
+#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>

 #include <algorithms/CoarsenedMatrix.h>

--- a/lib/Config.h.in
+++ b/lib/Config.h.in
@@ -9,6 +9,9 @@
 /* AVX512 Intrinsics for Knights Landing */
 #undef AVX512

+/* AVX Intrinsics with FMA4 */
+#undef AVXFMA4
+
 /* EMPTY_SIMD only for DEBUGGING */
 #undef EMPTY_SIMD

--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -124,6 +124,7 @@ namespace Grid {
 	  if ( comm_dim ) {
 	    sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
 	    sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
+	    //	    std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 	    if ( sshift[0] == sshift[1] ) {
 	      if (splice_dim) {
 		GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
@@ -164,23 +165,23 @@ namespace Grid {
 	  assert(comm_dim==1);
 	  assert(shift>=0);
 	  assert(shift<fd);
-	  
+
 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
-	  
+
 	  std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
 	  std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
-	  
+
 	  int cb= (cbmask==0x2)? Odd : Even;
 	  int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
-	  
+
 	  for(int x=0;x<rd;x++){       
-	    
+
 	    int sx        = (x+sshift)%rd;
 	    int comm_proc = ((x+sshift)/rd)%pd;

 	    if (comm_proc) {
 	      
-	      int words = send_buf.size();
+	      int words = buffer_size;
 	      if (cbmask != 0x3) words=words>>1;
 	    
 	      int bytes = words * sizeof(cobj);
@@ -201,10 +202,11 @@ namespace Grid {
 				   recv_from_rank,
 				   bytes);

-	      for(int i=0;i<buffer_size;i++){
+	      for(int i=0;i<words;i++){
 		u_comm_buf[u_comm_offset+i]=recv_buf[i];
+		//		std::cout << " Halo["<<i<<"] snd "<<send_buf[i]<< " rcv "<<recv_buf[i]<<"  mask 0x"<<cbmask<<std::endl;
 	      }
-	      u_comm_offset+=buffer_size;
+	      u_comm_offset+=words;
 	    }
 	  }
 	}
@@ -241,6 +243,7 @@ namespace Grid {
 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 	  int words = sizeof(cobj)/sizeof(vector_type);

+	  assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 	  /*
 	   * possibly slow to allocate
 	   * Doesn't matter in this test, but may want to preallocate in the 
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -9,23 +9,34 @@ namespace Grid {
  ////////////////////////////////////////////////////////////////////////////////////////////
  // Simple general polynomial with user supplied coefficients
  ////////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Operator function that applies the Hermitian operator once: out = HermOp(in)
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field>
+  class HermOpOperatorFunction : public OperatorFunction<Field> {
+  public:
+    // Implement the required interface. Declared public (as Polynomial does) so the
+    // functor can also be invoked directly on an object of the derived type.
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+      Linop.HermOp(in,out);
+    };
+  };
+
  template<class Field>
  class Polynomial : public OperatorFunction<Field> {
  private:
-    std::vector<double> Coeffs;
+    std::vector<RealD> Coeffs;
  public:
-    Polynomial(std::vector<double> &_Coeffs) : Coeffs(_Coeffs) {};
+    Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };

    // Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

-      Field AtoN = in;
+      Field AtoN(in._grid);
+      Field Mtmp(in._grid);
+      AtoN = in;
      out = AtoN*Coeffs[0];
-
+      //      std::cout <<"Poly in " <<norm2(in)<<std::endl;
+      //      std::cout <<"0 " <<norm2(out)<<std::endl;
      for(int n=1;n<Coeffs.size();n++){
-	Field Mtmp=AtoN;
-	Linop.Op(Mtmp,AtoN);
+	Mtmp = AtoN;
+	Linop.HermOp(Mtmp,AtoN);
 	out=out+AtoN*Coeffs[n];
+	//	std::cout << n<<" " <<norm2(out)<<std::endl;
      }
    };
  };
@@ -36,15 +47,15 @@ namespace Grid {
  template<class Field>
  class Chebyshev : public OperatorFunction<Field> {
  private:
-    std::vector<double> Coeffs;
+    std::vector<RealD> Coeffs;
    int order;
-    double hi;
-    double lo;
+    RealD hi;
+    RealD lo;

  public:
    void csv(std::ostream &out){
-      for (double x=lo; x<hi; x+=(hi-lo)/1000) {
-	double f = approx(x);
+      for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
+	RealD f = approx(x);
 	out<< x<<" "<<f<<std::endl;
      }
      return;
@@ -53,15 +64,19 @@ namespace Grid {
    // Convenience for plotting the approximation
    void   PlotApprox(std::ostream &out) {
      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
-      for(double x=lo;x<hi;x+=(hi-lo)/50.0){
+      for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
 	out <<x<<"\t"<<approx(x)<<std::endl;
      }
    };

+    Chebyshev(){};
+    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
-    //
-    Chebyshev(double _lo,double _hi,int _order, double (* func)(double) ){
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
+    {
      lo=_lo;
      hi=_hi;
      order=_order;
@@ -69,24 +84,26 @@ namespace Grid {
      if(order < 2) exit(-1);
      Coeffs.resize(order);
      for(int j=0;j<order;j++){
-	double s=0;
+	RealD s=0;
 	for(int k=0;k<order;k++){
-	  double y=std::cos(M_PI*(k+0.5)/order);
-	  double x=0.5*(y*(hi-lo)+(hi+lo));
-	  double f=func(x);
+	  RealD y=std::cos(M_PI*(k+0.5)/order);
+	  RealD x=0.5*(y*(hi-lo)+(hi+lo));
+	  RealD f=func(x);
 	  s=s+f*std::cos( j*M_PI*(k+0.5)/order );
 	}
 	Coeffs[j] = s * 2.0/order;
      }
    };
+
+    
    void JacksonSmooth(void){
-      double M=order;
-      double alpha = M_PI/(M+2);
-      double lmax = std::cos(alpha);
-      double sumUsq =0;
-      std::vector<double> U(M);
-      std::vector<double> a(M);
-      std::vector<double> g(M);
+      RealD M=order;
+      RealD alpha = M_PI/(M+2);
+      RealD lmax = std::cos(alpha);
+      RealD sumUsq =0;
+      std::vector<RealD> U(M);
+      std::vector<RealD> a(M);
+      std::vector<RealD> g(M);
      for(int n=0;n<=M;n++){
 	U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
 	sumUsq += U[n]*U[n];
@@ -107,18 +124,18 @@ namespace Grid {
 	Coeffs[m]*=g[m];
      }
    }
-    double approx(double x) // Convenience for plotting the approximation
+    RealD approx(RealD x) // Convenience for plotting the approximation
    {
-      double Tn;
-      double Tnm;
-      double Tnp;
+      RealD Tn;
+      RealD Tnm;
+      RealD Tnp;
      
-      double y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+      RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
      
-      double T0=1;
-      double T1=y;
+      RealD T0=1;
+      RealD T1=y;
      
-      double sum;
+      RealD sum;
      sum = 0.5*Coeffs[0]*T0;
      sum+= Coeffs[1]*T1;
      
@@ -151,8 +168,8 @@ namespace Grid {

      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
      // Tn=T1 = (xscale M + mscale)in
-      double xscale = 2.0/(hi-lo);
-      double mscale = -(hi+lo)/(hi-lo);
+      RealD xscale = 2.0/(hi-lo);
+      RealD mscale = -(hi+lo)/(hi-lo);
      Linop.HermOp(T0,y);
      T1=y*xscale+in*mscale;

@@ -179,5 +196,121 @@ namespace Grid {
  };


+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Chebyshev polynomial in the shifted and squared operator.
+  // The polynomial argument is  x = ( 2 (A-mu)^2 - (alpha^2+beta^2) ) / (alpha^2-beta^2).
+  // Only the highest coefficient is non-zero, so for order >= 2 the result is T_{order-1}(x).
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field>
+  class ChebyshevLanczos : public Chebyshev<Field> {
+  private:
+    std::vector<RealD> Coeffs; // own table: the members of the same name in Chebyshev are private
+    int order;
+    RealD alpha;
+    RealD beta;
+    RealD mu;
+
+  public:
+    // _order must be at least 2: approx() and operator() read Coeffs[1] unconditionally.
+    ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
+    alpha(_alpha),
+      beta(_beta),
+          mu(_mu)
+    {
+      order=_order;
+      Coeffs.resize(order);
+      for(int i=0;i<_order;i++){
+	Coeffs[i] = 0.0;
+      }
+      Coeffs[order-1]=1.0;
+    };
+
+    // Tabulate approx(x) on [-1.2 alpha, 1.2 alpha) as "x f(x)" lines
+    void csv(std::ostream &out){
+      for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
+	RealD f = approx(x);
+	out<< x<<" "<<f<<std::endl;
+      }
+      return;
+    }
+
+    // Scalar version of operator(): evaluate the polynomial at the number xx.
+    RealD approx(RealD xx) // Convenience for plotting the approximation
+    {
+      RealD Tn;
+      RealD Tnm;
+      RealD Tnp;
+      // RealD, as in AminusMuSq, so the plotted curve uses the same precision as the operator
+      RealD aa = alpha * alpha;
+      RealD bb = beta  *  beta;
+      
+      RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
+
+      RealD y= x;
+      
+      RealD T0=1;
+      RealD T1=y;
+      
+      RealD sum;
+      sum = 0.5*Coeffs[0]*T0;
+      sum+= Coeffs[1]*T1;
+      
+      // Three term recurrence T_{n+1} = 2 y T_n - T_{n-1}
+      Tn =T1;
+      Tnm=T0;
+      for(int i=2;i<order;i++){
+	Tnp=2*y*Tn-Tnm;
+	Tnm=Tn;
+	Tn =Tnp;
+	sum+= Tn*Coeffs[i];
+      }
+      return sum;
+    };
+
+    // shift_Multiply in Rudy's code
+    // out = ( 2 (A-mu)^2 in - (alpha^2+beta^2) in ) / (alpha^2-beta^2), with A applied via HermOp
+    void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out) 
+    {
+      GridBase *grid=in._grid;
+      Field tmp(grid);
+
+      RealD aa= alpha*alpha;
+      RealD bb= beta * beta;
+
+      Linop.HermOp(in,out);
+      out = out - mu*in;       // out = (A-mu) in
+
+      Linop.HermOp(out,tmp);
+      tmp = tmp - mu * out;    // tmp = (A-mu)^2 in
+
+      out = (2.0/ (aa-bb) ) * tmp -  ((aa+bb)/(aa-bb))*in;
+    };
+    // Implement the required interface
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
+
+      GridBase *grid=in._grid;
+
+      Field T0(grid); T0 = in;  
+      Field T1(grid); 
+      Field T2(grid);
+      Field  y(grid);
+      
+      Field *Tnm = &T0;
+      Field *Tn  = &T1;
+      Field *Tnp = &T2;
+
+      // Tn=T1 = (xscale M )*in
+      AminusMuSq(Linop,T0,T1);
+
+      // sum = .5 c[0] T0 + c[1] T1
+      out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
+      for(int n=2;n<order;n++){
+	
+	AminusMuSq(Linop,*Tn,y);
+
+	*Tnp=2.0*y-(*Tnm);
+
+	out=out+Coeffs[n]* (*Tnp);
+
+	// Cycle pointers to avoid copies
+	Field *swizzle = Tnm;
+	Tnm    =Tn;
+	Tn     =Tnp;
+	Tnp    =swizzle;
+	  
+      }
+    }
+  };
 }
 #endif
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@@ -0,0 +1,106 @@
+#ifndef GRID_DENSE_MATRIX_H
+#define GRID_DENSE_MATRIX_H
+
+namespace Grid {
+    /////////////////////////////////////////////////////////////
+    // Matrix untils
+    /////////////////////////////////////////////////////////////
+
+template<class T> using DenseVector = std::vector<T>;
+template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
+
+// Report the number of elements of a dense vector through N.
+template<class T> void Size(DenseVector<T> & vec, int &N) 
+{ 
+  const auto count = vec.size();
+  N = count;
+}
+// Report the dimensions of a dense matrix: N rows and M columns.
+// The column count is read from the first row; an empty matrix reports M=0
+// rather than indexing a row that does not exist.
+template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M) 
+{ 
+  N= mat.size();
+  M= mat.empty() ? 0 : mat[0].size();
+}
+
+// Report the dimension N of a square matrix; asserts that rows == columns.
+template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N) 
+{ 
+  int cols = 0;
+  Size(mat,N,cols);
+  assert(N==cols);
+}
+
+// Give the matrix N rows of M entries each.
+template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
+  mat.resize(N);
+  for(auto &row : mat){
+    row.resize(M);
+  }
+}
+// Assign val to every entry of the matrix (dimensions as reported by Size).
+template<class T> void Fill(DenseMatrix<T> & mat, T&val) { 
+  int rows,cols;
+  Size(mat,rows,cols);
+  for(int r=0;r<rows;r++){
+    DenseVector<T> &row = mat[r];
+    for(int c=0;c<cols;c++){
+      row[c] = val;
+    }
+  }
+}
+
+/** Return the transpose: an M x N matrix built from the N x M input **/
+template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
+  int rows,cols;
+  Size(mat,rows,cols);
+  DenseMatrix<T> result;
+  Resize(result,cols,rows);
+  for(int c=0;c<cols;c++){
+    for(int r=0;r<rows;r++){
+      result[c][r] = mat[r][c];
+    }
+  }
+  return result;
+}
+/** Overwrite a square matrix with the identity **/
+template<class T> void Unity(DenseMatrix<T> &A){
+  int dim;  SizeSquare(A,dim);
+  for(int row=0;row<dim;row++){
+    for(int col=0;col<dim;col++){
+      A[row][col] = ( row==col ) ? 1 : 0;
+    }
+  }
+}
+
+/** Add c times the identity to a square matrix, in place **/
+template<class T>
+void PlusUnit(DenseMatrix<T> & A,T c){
+  int n;  SizeSquare(A,n);
+  for(int d=0;d<n;d++){
+    T &diag = A[d][d];
+    diag = diag + c;
+  }
+}
+
+/** Return the Hermitian conjugate (conjugate transpose) of a square matrix **/
+template<class T>
+DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
+  int n; SizeSquare(mat,n);
+
+  DenseMatrix<T> adj; Resize(adj,n,n);
+
+  for(int r=0;r<n;r++){
+    DenseVector<T> &dst = adj[r];
+    for(int c=0;c<n;c++){
+      dst[c] = conj(mat[c][r]);
+    }
+  }
+  return adj;
+}
+/** Copy rows [row_st,row_end) and columns [col_st,col_end) of A into a new matrix **/
+template <class T>
+DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
+{
+  const int nrow = row_end - row_st;
+  const int ncol = col_end - col_st;
+  DenseMatrix<T> H; Resize(H,nrow,ncol);
+
+  for(int r=0;r<nrow;r++){
+    for(int c=0;c<ncol;c++){
+      H[r][c]=A[row_st+r][col_st+c];
+    }
+  }
+  return H;
+}
+
+}
+
+#include <algorithms/iterative/Householder.h>
+#include <algorithms/iterative/Francis.h>
+
+#endif
+
--- a/lib/algorithms/iterative/EigenSort.h
+++ b/lib/algorithms/iterative/EigenSort.h
@@ -0,0 +1,52 @@
+#ifndef GRID_EIGENSORT_H
+#define GRID_EIGENSORT_H
+
+
+namespace Grid {
+    /////////////////////////////////////////////////////////////
+    // Eigen sorter to begin with
+    /////////////////////////////////////////////////////////////
+
+// Orders eigenvalues, optionally together with their eigenvectors, by increasing |lambda|.
+template<class Field>
+class SortEigen {
+ private:
+  
+  // Ordering on the magnitude of the eigenvalue
+  static bool less_lmd(RealD left,RealD right){
+    return fabs(left) < fabs(right);
+  }  
+  // Same ordering on (eigenvalue,eigenvector) pairs. The arguments are const:
+  // a comparator handed to the standard sort algorithms must not modify them.
+  static bool less_pair(const std::pair<RealD,Field>& left,
+		 const std::pair<RealD,Field>& right){
+    return fabs(left.first) < fabs(right.first);
+  }  
+  
+ public:
+
+  // Place the N entries of smallest |lmd| at the front of lmd in increasing
+  // magnitude and reorder evec[0..N) to match. Entries of lmd and evec from
+  // index N on are left as they were. Requires N <= lmd.size() <= evec.size().
+  void push(DenseVector<RealD>& lmd,
+	    DenseVector<Field>& evec,int N) {
+
+    DenseVector<std::pair<RealD, Field> > emod;
+    emod.reserve(lmd.size()); // single allocation, no re-copying of pairs while filling
+    
+    for(int i=0;i<lmd.size();++i){
+      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
+    }
+
+    std::partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
+
+    for(int i=0;i<N;++i){
+      lmd[i]=emod[i].first;
+      evec[i]=emod[i].second;
+    }
+  }
+  // Eigenvalue-only variant: sorts lmd in place, so the order of the entries
+  // from index N on is unspecified afterwards.
+  void push(DenseVector<RealD>& lmd,int N) {
+    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
+  }
+  // True when |lmd| exceeds |thrs|
+  bool saturated(RealD lmd, RealD thrs) {
+    return fabs(lmd) > fabs(thrs);
+  }
+};
+
+}
+#endif
--- a/lib/algorithms/iterative/Francis.h
+++ b/lib/algorithms/iterative/Francis.h
@@ -0,0 +1,498 @@
+#ifndef FRANCIS_H
+#define FRANCIS_H
+
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <fstream>
+#include <complex>
+#include <algorithm>
+
+//#include <timer.h>
+//#include <lapacke.h>
+//#include <Eigen/Dense>
+
+namespace Grid {
+
+template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
+template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
+
+/**
+  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
+H =
+      x  x  x  x  x  x  x  x  x
+      x  x  x  x  x  x  x  x  x
+      0  x  x  x  x  x  x  x  x
+      0  0  x  x  x  x  x  x  x
+      0  0  0  x  x  x  x  x  x
+      0  0  0  0  x  x  x  x  x
+      0  0  0  0  0  x  x  x  x
+      0  0  0  0  0  0  x  x  x
+      0  0  0  0  0  0  0  x  x
+Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary.
+
+Arguments, as used by the code below:
+  Hin   : matrix to diagonalise. It is copied, so the caller's matrix is not modified. An entry
+          below the first subdiagonal larger than small makes the routine print a message and exit(1).
+  evals : receives the eigenvalues; reversed at the end because the UT solve works in reverse order.
+  evecs : receives the eigenvectors, multiplied by the accumulated transformation P and normalized.
+  small : tolerance of the Hessenberg check; also passed on to Chop_subdiag.
+Returns the total number of QR steps. Exits if more than 100000 steps pass without a deflation.
+**/
+template <class T>
+int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
+{
+  DenseMatrix<T> H = Hin; 
+
+  int N ; SizeSquare(H,N);
+  int M = N;
+
+  Fill(evals,0);
+  Fill(evecs,0);
+
+  T s,t,x=0,y=0,z=0;
+  T u,d;
+  T apd,amd,bc;
+  DenseVector<T> p(N,0);
+  T nrm = Norm(H);    ///DenseMatrix Norm
+  int n, m;
+  int e = 0; // number of eigenvalues found so far
+  int it = 0; // QR steps since the last deflation; reset to 0 whenever eigenvalues are extracted
+  int tot_it = 0; // QR steps in total; this is the return value
+  int l = 0;
+  int r = 0;
+  DenseMatrix<T> P; Resize(P,N,N); Unity(P); // accumulates the Householder reflections applied to H
+  DenseVector<int> trows(N,0);
+
+  /// Check if the matrix is really hessenberg, if not abort
+  RealD sth = 0;
+  for(int j=0;j<N;j++){
+    for(int i=j+2;i<N;i++){
+      sth = abs(H[i][j]);
+      if(sth > small){
+	std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
+	exit(1);
+      }
+    }
+  }
+
+  do{
+    std::cout << "Francis QR Step N = " << N << std::endl;
+    /** Check for convergence
+      x  x  x  x  x
+      0  x  x  x  x
+      0  0  x  x  x
+      0  0  x  x  x
+      0  0  0  0  x
+      for this matrix l = 4
+     **/
+    do{
+      l = Chop_subdiag(H,nrm,e,small);
+      r = 0;    ///May have converged on more than one eval
+      ///Single eval
+      if(l == N-1){
+        evals[e] = H[l][l];
+        N--; e++; r++; it = 0;
+      }
+      ///Pair of evals from the trailing 2x2 block, via the quadratic formula (comment used to read "RealD eval")
+      if(l == N-2){
+        trows[l+1] = 1;    ///Needed for UTSolve
+        apd = H[l][l] + H[l+1][l+1];
+        amd = H[l][l] - H[l+1][l+1];
+        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
+        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
+        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
+        N-=2; e+=2; r++; it = 0;
+      }
+    } while(r>0);
+
+    if(N ==0) break;
+
+    DenseVector<T > ck; Resize(ck,3);
+    DenseVector<T> v;   Resize(v,3);
+
+    for(int m = N-3; m >= l; m--){
+      ///Starting vector essentially random shift.
+      if(it%10 == 0 && N >= 3 && it > 0){
+        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
+        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
+        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
+        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
+        z = H[m+1][m]*H[m+2][m+1];
+      }
+      ///Starting vector implicit Q theorem
+      else{
+        s = (H[N-2][N-2] + H[N-1][N-1]);
+        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
+        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
+        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
+        z = H[m+1][m]*H[m+2][m+1];
+      }
+      ck[0] = x; ck[1] = y; ck[2] = z;
+
+      if(m == l) break;
+
+      /** Test taken from Numerical Recipes (page not recorded); it appears to work **/
+      // PAB.. for heaven's sake quote page, purpose, evidence it works.
+      //       what sort of comment is that!?!?!?
+      u=abs(H[m][m-1])*(abs(y)+abs(z));
+      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
+      if ((T)abs(u+d) == (T)abs(d) ){
+	l = m; break;
+      }
+
+      //if (u < small){l = m; break;}
+    }
+    if(it > 100000){
+     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
+     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
+      exit(1);
+    }
+    normalize(ck);    ///Normalization cancels in PHP anyway
+    T beta;
+    Householder_vector<T >(ck, 0, 2, v, beta);
+    Householder_mult<T >(H,v,beta,0,l,l+2,0);
+    Householder_mult<T >(H,v,beta,0,l,l+2,1);
+    ///Accumulate eigenvector
+    Householder_mult<T >(P,v,beta,0,l,l+2,1);
+    int sw = 0;      ///Are we on the last row?
+    for(int k=l;k<N-2;k++){
+      x = H[k+1][k];
+      y = H[k+2][k];
+      z = (T)0.0;
+      if(k+3 <= N-1){
+	z = H[k+3][k];
+      } else{
+	sw = 1; 
+	v[2] = (T)0.0;
+      }
+      ck[0] = x; ck[1] = y; ck[2] = z;
+      normalize(ck);
+      Householder_vector<T >(ck, 0, 2-sw, v, beta);
+      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
+      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
+      ///Accumulate eigenvector
+      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
+    }
+    it++;
+    tot_it++;
+  }while(N > 1);
+  N = evals.size();
+  ///Annoying - UT solves in reverse order;
+  DenseVector<T> tmp; Resize(tmp,N);
+  for(int i=0;i<N;i++){
+    tmp[i] = evals[N-i-1];
+  } 
+  evals = tmp;
+  UTeigenvectors(H, trows, evals, evecs);
+  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
+  return tot_it;
+}
+
+// Forward declaration of the five-argument overload defined further down. The wrapper
+// below calls it, and that call is only resolved when the template is instantiated:
+// for T such as double or std::complex the arguments are std::vector types, so
+// argument-dependent lookup does not search namespace Grid and would miss an
+// overload that is declared only after the wrapper.
+template <class T>
+int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol);
+
+/**
+  Find the eigenvalues of a tridiagonal matrix using the Wilkinson QR algorithm.
+  H =
+  x  x  0  0  0  0
+  x  x  x  0  0  0
+  0  x  x  x  0  0
+  0  0  x  x  x  0
+  0  0  0  x  x  x
+  0  0  0  0  x  x
+  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary.
+  Convenience overload: small is used both as the convergence threshold and as the
+  tolerance of the tridiagonality check. Returns the number of QR steps.
+**/
+template <class T>
+int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
+{
+  return my_Wilkinson(Hin, evals, evecs, small, small);
+}
+
+/**
+  Eigen-decomposition of a symmetric tridiagonal matrix by implicitly shifted QR steps
+  built from Givens rotations.
+
+  Hin   : input matrix. It is copied and the copy is divided by Norm(Hin), so the caller's
+          matrix is not modified. Entries outside the tridiagonal band larger than tol only
+          produce a warning on std::cout.
+  evals : receives the eigenvalues, multiplied back by Norm(Hin) at the end.
+  evecs : receives the normalized eigenvectors, multiplied by the accumulated rotations P.
+  small : threshold passed to Chop_symm_subdiag.
+  tol   : tolerance of the tridiagonality check.
+  Returns the total number of QR steps. Exits if more than 1000000 steps pass without
+  extracting an eigenvalue.
+**/
+template <class T>
+int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
+{
+  int N; SizeSquare(Hin,N);
+
+  ///I don't want to modify the input but matrices must be passed by reference
+  //Scale a matrix by its "norm"
+  DenseMatrix<T> H;  H = Hin;
+  
+  RealD Hnorm = abs(Norm(Hin));
+  H = H * (1.0 / Hnorm);
+
+  // TODO use openmp and memset
+  Fill(evals,0);
+  Fill(evecs,0);
+
+  // NOTE(review): y is never assigned in this routine, so abs(y) below always
+  // contributes zero to u -- confirm that this is intended.
+  T t, x = 0, y = 0, z = 0;
+  T u, d;
+  T apd, amd, bc;
+
+  T nrm = Norm(H);    ///DenseMatrix Norm
+  int e = 0;          ///eigenvalues found so far
+  int it = 0;         ///QR steps since the last eigenvalue was extracted
+  int tot_it = 0;     ///QR steps in total (return value)
+  int l = 0;
+  int r = 0;
+  DenseMatrix<T> P; Resize(P,N,N);
+  Unity(P);           ///accumulates the rotations applied to H
+  DenseVector<int> trows(N, 0);
+  /// Check if the matrix is really symm tridiag; only warns, does not abort
+  for(int j = 0; j < N; ++j)
+  {
+    for(int i = j + 2; i < N; ++i)
+    {
+      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
+      {
+	std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
+	std::cout << "Warning tridiagonalize and call again" << std::endl;
+      }
+    }
+  }
+
+  do{
+    do{
+      //Jasper
+      //Check if the subdiagonal term is small enough (<small)
+      //if true then it is converged.
+      //check start from H.dim - e - 1
+      //How to deal with more than 2 are converged?
+      //What if Chop_symm_subdiag return something int the middle?
+      //--------------
+      l = Chop_symm_subdiag(H,nrm, e, small);
+      r = 0;    ///May have converged on more than one eval
+      //Jasper
+      //In this case
+      // x  x  0  0  0  0
+      // x  x  x  0  0  0
+      // 0  x  x  x  0  0
+      // 0  0  x  x  x  0
+      // 0  0  0  x  x  0
+      // 0  0  0  0  0  x  <- l
+      //--------------
+      ///Single eval
+      if(l == N - 1)
+      {
+        evals[e] = H[l][l];
+        N--;
+        e++;
+        r++;
+        it = 0;
+      }
+      //Jasper
+      // x  x  0  0  0  0
+      // x  x  x  0  0  0
+      // 0  x  x  x  0  0
+      // 0  0  x  x  0  0
+      // 0  0  0  0  x  x  <- l
+      // 0  0  0  0  x  x
+      //--------------
+      ///Pair of evals from the decoupled 2x2 block, via the quadratic formula
+      if(l == N - 2)
+      {
+        trows[l + 1] = 1;    ///Needed for UTSolve
+        apd = H[l][l] + H[l + 1][ l + 1];
+        amd = H[l][l] - H[l + 1][l + 1];
+        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
+        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
+        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
+        N -= 2;
+        e += 2;
+        r++;
+        it = 0;
+      }
+    }while(r > 0);
+    //Jasper
+    //Already converged
+    //--------------
+    if(N == 0) break;
+
+    for(int m = N - 3; m >= l; m--)
+    {
+      ///Starting vector essentially random shift.
+      if(it%10 == 0 && N >= 3 && it > 0)
+      {
+        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
+        x = H[m][m] - t;
+        z = H[m + 1][m];
+      } else {
+      ///Starting vector implicit Q theorem
+        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
+        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
+	  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
+        x = H[m][m] - t;
+        z = H[m + 1][m];
+      }
+      //Jasper
+      //why it is here????
+      //-----------------------
+      if(m == l)
+        break;
+
+      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
+      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
+      if ((T)abs(u + d) == (T)abs(d))
+      {
+        l = m;
+        break;
+      }
+    }
+    //Jasper
+    if(it > 1000000)
+    {
+      // message now states the limit that is actually tested above
+      std::cout << "Wilkinson: bugger it got stuck after 1000000 iterations" << std::endl;
+      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
+      exit(1);
+    }
+    //
+    T s, c;
+    Givens_calc<T>(x, z, c, s);
+    Givens_mult<T>(H, l, l + 1, c, -s, 0);
+    Givens_mult<T>(H, l, l + 1, c,  s, 1);
+    Givens_mult<T>(P, l, l + 1, c,  s, 1);
+    //
+    for(int k = l; k < N - 2; ++k)
+    {
+      // DenseMatrix is a vector of vectors: index it directly, it has no member A
+      x = H[k + 1][k];
+      z = H[k + 2][k];
+      Givens_calc<T>(x, z, c, s);
+      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
+      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
+      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
+    }
+    it++;
+    tot_it++;
+  }while(N > 1);
+
+  N = evals.size();
+  ///Annoying - UT solves in reverse order;
+  DenseVector<T> tmp(N);
+  for(int i = 0; i < N; ++i)
+    tmp[i] = evals[N-i-1];
+  evals = tmp;
+  //
+  UTeigenvectors(H, trows, evals, evecs);
+  //UTSymmEigenvectors(H, trows, evals, evecs);
+  for(int i = 0; i < evals.size(); ++i)
+  {
+    evecs[i] = P * evecs[i];
+    normalize(evecs[i]);
+    evals[i] = evals[i] * Hnorm;   ///undo the initial scaling of H
+  }
+  return tot_it;
+}
+
+template <class T>
+void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
+
+  /**
+  turn a matrix A =
+  x  x  x  x  x
+  x  x  x  x  x
+  x  x  x  x  x
+  x  x  x  x  x
+  x  x  x  x  x
+  into
+  x  x  x  x  x
+  x  x  x  x  x
+  0  x  x  x  x
+  0  0  x  x  x
+  0  0  0  x  x
+  with householder rotations
+  Slow.
+  A is transformed in place (A -> P A P^H for each column k >= start) and the same
+  reflections are multiplied into Q from the right.
+  */
+  int N ; SizeSquare(A,N);
+
+  for(int k=start;k<N-2;k++){
+    //cerr << "hess" << k << std::endl;
+    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
+    // DenseMatrix is a vector of vectors, so elements are reached with [i][k]
+    for(int i=k+1;i<N;i++){ck[i-k-1] = A[i][k];}  ///kth column
+    normalize(ck);    ///Normalization cancels in PHP anyway
+    T beta;
+    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
+    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
+    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
+    ///Accumulate eigenvector
+    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
+  }
+}
+
+/**
+  Tridiagonalize a matrix. Delegates entirely to Hess; A and Q are updated in place
+  exactly as Hess updates them.
+**/
+template <class T>
+void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
+  int dim; SizeSquare(A,dim);   // dim itself is unused; the call asserts that A is square
+  Hess(A,Q,start);
+}
+
+/** Zero every entry that lies two or more places away from the diagonal **/
+template <class T>
+void ForceTridiagonal(DenseMatrix<T> &A){
+  int dim ; SizeSquare(A,dim);
+  for(int row=0;row<dim-2;row++){
+    for(int col=row+2;col<dim;col++){
+      A[row][col]=0;   // above the band
+      A[col][row]=0;   // mirror entry below the band
+    }
+  }
+}
+
+/**
+  Solve a symmetric eigensystem, not necessarily in tridiagonal form.
+  Works on a copy of Ain: tridiagonalizes it with Tri, diagonalizes with my_Wilkinson,
+  then multiplies each eigenvector by the transformation accumulated in Tri.
+  Returns the iteration count reported by my_Wilkinson.
+**/
+template <class T>
+int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
+  int dim; SizeSquare(Ain,dim);
+  DenseMatrix<T > work; work = Ain;
+  DenseMatrix<T > rot;  Resize(rot,dim,dim); Unity(rot);
+  Tri(work,rot,0);
+  const int iterations = my_Wilkinson<T>(work, evals, evecs, small);
+  for(int k=0;k<dim;k++){
+    evecs[k] = rot*evecs[k];
+  }
+  return iterations;
+}
+
+
+/** Public name for my_Wilkinson with a single tolerance; returns its iteration count **/
+template <class T>
+int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
+  const int iterations = my_Wilkinson(Ain, evals, evecs, small);
+  return iterations;
+}
+
+/** Public name for my_SymmEigensystem; returns its iteration count **/
+template <class T>
+int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
+  const int iterations = my_SymmEigensystem(Ain, evals, evecs, small);
+  return iterations;
+}
+
+/**
+  Solve a general eigensystem, not necessarily in Hessenberg form.
+  Works on a copy of Ain: reduces it with Hess, diagonalizes with QReigensystem, then
+  multiplies each eigenvector by the transformation accumulated in Hess.
+  Returns the iteration count reported by QReigensystem.
+**/
+template <class T>
+int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
+  // DenseMatrix is a vector of vectors: use the free helpers, as my_SymmEigensystem
+  // does, rather than the members (dim, Unity) of a matrix class it does not have.
+  int N; SizeSquare(Ain,N);
+  DenseMatrix<T > A; A = Ain;
+  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
+  Hess(A,Q,0);
+  int it = QReigensystem<T>(A, evals, evecs, small);
+  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
+  return it;
+}
+
+}
+#endif
--- a/lib/algorithms/iterative/Householder.h
+++ b/lib/algorithms/iterative/Householder.h
@@ -0,0 +1,215 @@
+#ifndef HOUSEHOLDER_H
+#define HOUSEHOLDER_H
+
+// Trace helpers: print the enclosing function name plus file and line.
+// __func__ is the standard predefined identifier for the enclosing function
+// (__FUNC__ is not a standard name). TIMER ignores its argument. Each macro
+// already ends in ';'.
+#define TIMER(A) std::cout << GridLogMessage << __func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
+#define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
+#define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
+
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <fstream>
+#include <complex>
+#include <algorithm>
+
+namespace Grid {
+/** Magnitude ordering used to find the largest-|x| element of a vector:
+    true when |i| is strictly smaller than |j|. **/
+template <class T> bool cf(T i, T j) {
+  const auto lhs = abs(i);
+  const auto rhs = abs(j);
+  return lhs < rhs;
+}
+
+/**
+	Calculate a real Givens rotation.
+
+	Given (y,z), produce (c,s) such that  c*z + s*y = 0.
+	The branch on |z| >= |y| keeps the intermediate ratio t at magnitude <= 1.
+	When z is zero the rotation is the identity (c = 1, s = 0); this case
+	returns immediately so that y == z == 0 never evaluates 0/0.
+ **/
+template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
+
+  RealD mz = (RealD)abs(z);
+
+  if(mz==0.0){
+    c = 1; s = 0;
+    return;
+  }
+  if(mz >= (RealD)abs(y)){
+    T t = -y/z;
+    s = (T)1.0 / sqrt ((T)1.0 + t * t);
+    c = s * t;
+  } else {
+    T t = -z/y;
+    c = (T)1.0 / sqrt ((T)1.0 + t * t);
+    s = c * t;
+  }
+}
+
+/**
+	Apply the plane rotation (c,s) to the index pair (i,k) of square matrix A.
+	dir == 0 mixes rows i and k:       row_i <-  c*row_i + s*row_k,  row_k <- -s*row_i + c*row_k
+	dir == 1 mixes columns i and k:    col_i <-  c*col_i - s*col_k,  col_k <-  s*col_i + c*col_k
+	Any other dir leaves A untouched.
+ **/
+template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
+{
+  int dim ; SizeSquare(A,dim);
+
+  if(dir == 0){
+    for(int col=0; col<dim; ++col){
+      const T upper = A[i][col];
+      const T lower = A[k][col];
+      A[i][col] = (c*upper + s*lower);
+      A[k][col] = (-s*upper + c*lower);
+    }
+  } else if(dir == 1){
+    for(int row=0; row<dim; ++row){
+      const T left  = A[row][i];
+      const T right = A[row][k];
+      A[row][i] = (c*left - s*right);
+      A[row][k] = (s*left + c*right);
+    }
+  }
+}
+
+/**
+	Build the (unreduced) complex Householder vector v and scalar beta from
+	entries k..j of input, for the reflector
+	P = (I - b v transpose(v) ),  b = 2/v.v
+
+	P | x |    | x | k = 0
+	| x |    | 0 |
+	| x | =  | 0 |
+	| x |    | 0 | j = 3
+	| x |	   | x |
+
+	Only v[k..j] is written; v must already be large enough.
+	The entries are divided by the largest-magnitude element of the range
+	before the norm is accumulated.
+	If every entry in the range is zero, v[k..j] is zeroed and beta is set to
+	zero, so callers testing abs(beta) > 0 never read an unset value.
+ **/
+template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
+{
+  T m = *std::max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
+
+  if(abs(m) > 0.0){
+    T alpha = 0;
+
+    for(int i=k; i<j+1; i++){
+      v[i] = input[i]/m;
+      alpha = alpha + v[i]*conj(v[i]);
+    }
+    alpha = sqrt(alpha);
+    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
+
+    // Add alpha with the phase of v[k] so the two terms cannot cancel.
+    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
+    else                 v[k] = -alpha;
+  } else{
+    for(int i=k; i<j+1; i++){
+      v[i] = 0.0;
+    }
+    beta = 0.0;   // nothing to reflect
+  }
+}
+
+/**
+	Build the (unreduced) complex Householder vector v and scalar beta from
+	entries k..j of input, for the reflector
+	P = (I - b v transpose(v) ),  b = 2/v.v
+
+	Px = alpha*e_dir
+
+	Same construction as the overload without dir, except that the pivot
+	entry is v[dir] instead of v[k].
+	Only v[k..j] (and v[dir]) are written; v must already be large enough.
+	If every entry in the range is zero, v[k..j] is zeroed and beta is set to
+	zero, so callers testing abs(beta) > 0 never read an unset value.
+ **/
+
+template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
+{
+  // cf must be named with its template argument to be usable as a comparator.
+  T m = *std::max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
+
+  if(abs(m) > 0.0){
+    T alpha = 0;
+
+    for(int i=k; i<j+1; i++){
+      v[i] = input[i]/m;
+      alpha = alpha + v[i]*conj(v[i]);
+    }
+
+    alpha = sqrt(alpha);
+    beta = (T)1.0/(alpha*(alpha + abs(v[dir]) ));
+
+    // Add alpha with the phase of v[dir] so the two terms cannot cancel.
+    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
+    else                  v[dir] = -alpha;
+  }else{
+    for(int i=k; i<j+1; i++){
+      v[i] = 0.0;
+    }
+    beta = 0.0;   // nothing to reflect
+  }
+}
+
+/**
+	Apply a Householder reflector to the square matrix A in place.
+	trans == 0 : acts on rows k..j of each column p  (product from the left)
+	otherwise  : acts on columns k..j of each row p  (product from the right)
+	p runs from l to N-1. Nothing is done unless abs(beta) > 0.
+
+	v is read as v[i-k] for i in k..j, i.e. with the first active entry at
+	index 0.
+	NOTE(review): Householder_vector fills v[k..j] (absolute indices), so the
+	two only agree when k == 0 or the caller shifts v - confirm at call sites.
+ **/
+
+template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
+{
+  int dim ; SizeSquare(A,dim);
+
+  if(!(abs(beta) > 0.0)) return;
+
+  const bool from_left = (trans==0);
+
+  for(int p=l; p<dim; ++p){
+    // projection of the active slice of column/row p onto v, scaled by beta
+    T dot = 0;
+    for(int i=k; i<=j; ++i){
+      dot += conj(v[i-k]) * (from_left ? A[i][p] : A[p][i]);
+    }
+    dot *= beta;
+
+    for(int i=k; i<=j; ++i){
+      T &target = from_left ? A[i][p] : A[p][i];
+      target = target - dot*conj(v[i-k]);
+    }
+  }
+}
+
+/**
+	Compute the product PA if trans = 0
+	AP if trans = 1
+	P = (I - b v transpose(v) )
+	b = 2/v.v
+	Only columns/rows p in [l, M) are touched, and within each only the
+	entries k..j. v is read as v[i-k].
+	A is tridiagonal
+
+	Each p only reads and writes its own slice of A, so the update is applied
+	in place; no N x N scratch matrix is needed.
+ **/
+template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
+{
+  if(abs(beta) > 0.0){
+
+    int N ; SizeSquare(A,N);   // kept for its shape check; N itself is unused
+
+    for(int p=l; p<M; p++){
+      T s = 0;
+      if(trans==0){
+	for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
+	s = beta*s;
+	for(int i=k;i<j+1;i++) A[i][p] = A[i][p] - s*v[i-k];
+      }else{
+	for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
+	s = beta*s;
+	for(int i=k;i<j+1;i++) A[p][i] = A[p][i] - s*conj(v[i-k]);
+      }
+    }
+  }
+}
+}
+#endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
--- a/lib/algorithms/iterative/Matrix.h
+++ b/lib/algorithms/iterative/Matrix.h
@@ -0,0 +1,426 @@
+#ifndef MATRIX_H
+#define MATRIX_H
+
+#include <cstdlib>
+#include <string>
+#include <cmath>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <complex>
+#include <typeinfo>
+#include <Grid.h>
+
+
+/** Sign function: p divided by its own magnitude. There is no guard for p == 0. **/
+template <class T> T sign(T p)
+{
+  const auto magnitude = abs(p);
+  return ( p/magnitude );
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////// Hijack STL containers for our wicked means /////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Dense containers are plain STL vectors; a Matrix is a vector of vectors
+// addressed as mat[i][j]. (Vector was previously aliased to itself.)
+template<class T> using Vector = std::vector<T>;
+template<class T> using Matrix = Vector<Vector<T> >;
+
+/** Resize a Vector to N entries. **/
+template<class T> void Resize(Vector<T > & vec, int N)
+{
+  vec.resize(N);
+}
+
+/** Resize a Matrix to N outer entries, each of length M. **/
+template<class T> void Resize(Matrix<T > & mat, int N, int M)
+{
+  mat.resize(N);
+  for(auto &inner : mat){
+    inner.resize(M);
+  }
+}
+/** Report the length of a Vector through N. **/
+template<class T> void Size(Vector<T> & vec, int &N)
+{
+  const int length = vec.size();
+  N = length;
+}
+/** Report the shape of a Matrix: N = outer length, M = length of mat[0].
+    An empty matrix reports M = 0 instead of indexing mat[0]. **/
+template<class T> void Size(Matrix<T> & mat, int &N,int &M)
+{
+  N= mat.size();
+  M= mat.empty() ? 0 : mat[0].size();
+}
+/** Report the dimension of a square Matrix through N; asserts that the
+    outer length equals the length of the first inner vector. **/
+template<class T> void SizeSquare(Matrix<T> & mat, int &N)
+{
+  int inner;
+  Size(mat, N, inner);
+  assert(N == inner);
+}
+/** Report the shape of mat1 through (N1,M1) and assert that mat2 has the
+    same shape. **/
+template<class T> void SizeSame(Matrix<T> & mat1,Matrix<T> &mat2, int &N1,int &M1)
+{
+  Size(mat1, N1, M1);
+
+  int otherN, otherM;
+  Size(mat2, otherN, otherM);
+
+  assert(N1 == otherN);
+  assert(M1 == otherM);
+}
+
+//*****************************************
+//*	(Complex) Vector operations	*
+//*****************************************
+
+/** Element-wise conjugate of a Vector **/
+template <class T> Vector<T> conj(Vector<T> p)
+{
+  Vector<T> result(p.size());
+  auto out = result.begin();
+  for(const T &entry : p){
+    *out++ = conj(entry);
+  }
+  return result;
+}
+
+/** Norm of a Vector: abs(sqrt(sum_i p_i * conj(p_i))) **/
+template <class T> T norm(Vector<T> p)
+{
+  T accum = 0;
+  for(const T &entry : p){
+    accum = accum + entry*conj(entry);
+  }
+  return abs(sqrt(accum));
+}
+
+/** Norm squared of a Vector: abs(sum_i p_i * conj(p_i)) **/
+template <class T> T norm2(Vector<T> p)
+{
+  T accum = 0;
+  for(const T &entry : p){
+    accum = accum + entry*conj(entry);
+  }
+  return abs((accum));
+}
+
+/** Sum of all elements of a Vector **/
+template <class T> T trace(Vector<T> p)
+{
+  T total = 0;
+  for(const T &entry : p){
+    total = total + entry;
+  }
+  return total;
+}
+
+/** Set every entry of a Vector to the constant c **/
+template <class T> void Fill(Vector<T> &p, T c)
+{
+  for(auto &entry : p){
+    entry = c;
+  }
+}
+/** Divide a Vector by its norm(); left unchanged when abs(norm) is not > 0 **/
+template <class T> void normalize(Vector<T> &p)
+{
+  T m = norm(p);
+  if( !(abs(m) > 0.0) ) return;
+  for(auto &entry : p){
+    entry /= m;
+  }
+}
+/** Vector by scalar: returns a copy of p with every entry multiplied by s **/
+template <class T, class U> Vector<T> times(Vector<T> p, U s)
+{
+  for(auto &entry : p){
+    entry *= s;
+  }
+  return p;
+}
+/** Scalar by Vector: same scaling as times(p,s) with the arguments swapped **/
+template <class T, class U> Vector<T> times(U s, Vector<T> p)
+{
+  for(auto &entry : p){
+    entry *= s;
+  }
+  return p;
+}
+/** Inner product conj(a) . b, summed over the length of a **/
+template <class T> T inner(Vector<T> a, Vector<T> b)
+{
+  T total = 0.;
+  const int len = a.size();
+  for(int idx=0; idx<len; ++idx){
+    total = total + conj(a[idx])*b[idx];
+  }
+  return total;
+}
+/** Element-wise sum a + b; the result has the length of a **/
+template <class T> Vector<T> add(Vector<T> a, Vector<T> b)
+{
+  Vector<T> total(a.size());
+  const int len = a.size();
+  for(int idx=0; idx<len; ++idx){
+    total[idx] = a[idx] + b[idx];
+  }
+  return total;
+}
+/** Element-wise difference a - b; the result has the length of a **/
+template <class T> Vector<T> sub(Vector<T> a, Vector<T> b)
+{
+  Vector<T> diff(a.size());
+  const int len = a.size();
+  for(int idx=0; idx<len; ++idx){
+    diff[idx] = a[idx] - b[idx];
+  }
+  return diff;
+}
+
+/** 
+ *********************************
+ *	Matrices	         *
+ *********************************
+ **/
+
+/** Set every entry of a Matrix to val.
+    val is taken by const reference so that temporaries such as T(0) can be
+    passed as well as named variables. **/
+template<class T> void Fill(Matrix<T> & mat, const T&val) {
+  int N,M;
+  Size(mat,N,M);
+  for(int i=0;i<N;i++){
+  for(int j=0;j<M;j++){
+    mat[i][j] = val;
+  }}
+}
+
+/** Transpose of a matrix: returns C with C[i][j] = mat[j][i] **/
+template<class T> Matrix<T> Transpose(Matrix<T> & mat){
+  int N,M;
+  Size(mat,N,M);
+  Matrix<T> C; Resize(C,M,N);
+  for(int i=0;i<M;i++){
+  for(int j=0;j<N;j++){
+    C[i][j] = mat[j][i];
+  }}
+  return C;
+}
+/** Set a square Matrix to the unit matrix (1 on the diagonal, 0 elsewhere) **/
+template<class T> void Unity(Matrix<T> &mat){
+  int N;  SizeSquare(mat,N);
+  for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      if ( i==j ) mat[i][j] = 1;
+      else        mat[i][j] = 0;
+    }
+  }
+}
+/** Add c times the identity to a square Matrix (c is added to each diagonal entry) **/
+template<class T>
+void PlusUnit(Matrix<T> & A,T c)
+{
+  int dim;
+  SizeSquare(A,dim);
+  for(int d=0; d<dim; ++d){
+    T &entry = A[d][d];
+    entry = entry + c;
+  }
+}
+
+/** Return the Hermitian conjugate of a square matrix: C[i][j] = conj(mat[j][i]) **/
+template<class T> Matrix<T> HermitianConj(Matrix<T> &mat){
+
+  int dim; SizeSquare(mat,dim);
+
+  Matrix<T> C; Resize(C,dim,dim);
+
+  for(int i=0;i<dim;i++){
+    for(int j=0;j<dim;j++){
+      C[i][j] = conj(mat[j][i]);
+    }
+  }
+  return C;
+}
+
+/** Return the diagonal entries of a square matrix as a Vector **/
+template<class T> Vector<T> diag(Matrix<T> &A)
+{
+  int dim; SizeSquare(A,dim);
+  Vector<T> d; Resize(d,dim);
+
+  for(int i=0;i<dim;i++){
+    d[i] = A[i][i];
+  }
+  return d;
+}
+
+/** Left multiply by a Vector: C[j] = sum_i B[i] * A[i][j].
+    Asserts that the length of B equals the outer length of A. **/
+template<class T> Vector<T> operator *(Vector<T> &B,Matrix<T> &A)
+{
+  int K,M,N;
+  Size(B,K);
+  Size(A,M,N);
+  assert(K==M);
+
+  Vector<T> C; Resize(C,N);
+
+  for(int j=0;j<N;j++){
+    T sum = 0.0;
+    for(int i=0;i<M;i++){
+      sum += B[i] * A[i][j];
+    }
+    C[j] =  sum;
+  }
+  return C;
+}
+
+/** Return 1/diagonal entries of a square matrix as a Vector.
+    There is no guard against zero diagonal entries. **/
+template<class T> Vector<T> inv_diag(Matrix<T> & A){
+  int dim; SizeSquare(A,dim);
+  Vector<T> d; Resize(d,dim);
+  for(int i=0;i<dim;i++){
+    d[i] = 1.0/A[i][i];
+  }
+  return d;
+}
+/** Matrix Addition: element-wise A + B; SizeSame asserts the shapes match **/
+template<class T> inline Matrix<T> operator + (Matrix<T> &A,Matrix<T> &B)
+{
+  int N,M  ; SizeSame(A,B,N,M);
+  Matrix<T> C; Resize(C,N,M);
+  for(int i=0;i<N;i++){
+    for(int j=0;j<M;j++){
+      C[i][j] = A[i][j] +  B[i][j];
+    }
+  }
+  return C;
+}
+/** Matrix Subtraction: element-wise A - B; SizeSame asserts the shapes match **/
+template<class T> inline Matrix<T> operator- (Matrix<T> & A,Matrix<T> &B){
+  int N,M  ; SizeSame(A,B,N,M);
+  Matrix<T> C; Resize(C,N,M);
+  for(int i=0;i<N;i++){
+  for(int j=0;j<M;j++){
+    C[i][j] = A[i][j] -  B[i][j];
+  }}
+  return C;
+}
+
+/** Matrix scalar multiplication: returns C with C[i][j] = A[i][j]*c **/
+template<class T> inline Matrix<T> operator* (Matrix<T> & A,T c){
+  int N,M; Size(A,N,M);
+  Matrix<T> C; Resize(C,N,M);
+  for(int i=0;i<N;i++){
+  for(int j=0;j<M;j++){
+    C[i][j] = A[i][j]*c;
+  }}
+  return C;
+}
+/** Matrix Matrix multiplication: C[i][j] = sum_k A[i][k]*B[k][j].
+    Asserts that the inner length of A equals the outer length of B. **/
+template<class T> inline Matrix<T> operator* (Matrix<T> &A,Matrix<T> &B){
+  int K,L,N,M;
+  Size(A,K,L);
+  Size(B,N,M); assert(L==N);
+  Matrix<T> C; Resize(C,K,M);
+
+  for(int i=0;i<K;i++){
+    for(int j=0;j<M;j++){
+      T sum = 0.0;
+      for(int k=0;k<N;k++) sum += A[i][k]*B[k][j];
+      C[i][j] =sum;
+    }
+  }
+  return C;
+}
+/** Matrix Vector multiplication: C[i] = sum_j A[i][j]*B[j].
+    Asserts that the length of B equals the inner length of A. **/
+template<class T> inline Vector<T> operator* (Matrix<T> &A,Vector<T> &B){
+  int M,N,K;
+  Size(A,N,M);
+  Size(B,K); assert(K==M);
+  Vector<T> C; Resize(C,N);
+  for(int i=0;i<N;i++){
+    T sum = 0.0;
+    for(int j=0;j<M;j++) sum += A[i][j]*B[j];
+    C[i] =  sum;
+  }
+  return C;
+}
+
+/** Some version of Matrix norm **/
+/*
+inline T Norm(){ // this is not a usual L2 norm
+    T norm = 0;
+    for(int i=0;i<dim;i++){
+      for(int j=0;j<dim;j++){
+	norm += abs(A[i][j]);
+    }}
+    return norm;
+  }
+*/
+
+/** Largest magnitude found on the diagonal of a square matrix.
+    A[0][0] is read unconditionally, so A must not be empty. **/
+template<class T> T LargestDiag(Matrix<T> &A)
+{
+  int dim ; SizeSquare(A,dim);
+
+  T largest = abs(A[0][0]);
+  for(int d=1; d<dim; ++d){
+    const T candidate = abs(A[d][d]);
+    if(abs(candidate) > abs(largest) ){
+      largest = candidate;
+    }
+  }
+  return largest;
+}
+
+/** Scan the leading subdiagonal from row (dim-1-offset) down to row 1.
+    The first entry whose magnitude is below 'small' is set to zero and its
+    row index returned; 0 is returned if none is found.
+    The 'norm' argument is not used. **/
+template <class T,class U> int Chop_subdiag(Matrix<T> &A,T norm, int offset, U small)
+{
+  int dim; SizeSquare(A,dim);
+  int row = dim - 1 - offset;
+  while(row >= 1){
+    if((U)abs(A[row][row - 1]) < (U)small) {
+      A[row][row-1]=(U)0.0;
+      return row;
+    }
+    --row;
+  }
+  return 0;
+}
+
+/** Symmetric variant of Chop_subdiag: scan the leading subdiagonal from row
+    (dim-1-offset) down to row 1. The first entry whose magnitude is below
+    'small' is zeroed together with its mirror on the superdiagonal, and its
+    row index is returned; 0 is returned if none is found.
+    The 'norm' argument is not used. **/
+template <class T,class U> int Chop_symm_subdiag(Matrix<T> & A,T norm, int offset, U small)
+{
+  int dim; SizeSquare(A,dim);
+  int row = dim - 1 - offset;
+  while(row >= 1){
+    if((U)abs(A[row][row - 1]) < (U)small) {
+      A[row][row - 1] = (U)0.0;
+      A[row - 1][row] = (U)0.0;
+      return row;
+    }
+    --row;
+  }
+  return 0;
+}
+/** Assign a submatrix to a larger one:
+    A[i][j] = S[i-row_st][j-col_st] for row_st <= i < row_end, col_st <= j < col_end.
+    NB (from the original author) remember Vector Vectors are transposes of
+    the matrices they represent.
+    This template was defined twice with identical bodies; only one
+    definition is kept. **/
+template<class T>
+void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
+{
+  for(int i = row_st; i<row_end; i++){
+    for(int j = col_st; j<col_end; j++){
+      A[i][j] = S[i - row_st][j - col_st];
+    }
+  }
+}
+
+/** Get a submatrix: returns H of shape (row_end-row_st) x (col_end-col_st)
+    with H[i-row_st][j-col_st] = A[i][j]. **/
+template <class T>
+Matrix<T> GetSubMtx(Matrix<T> &A,int row_st, int row_end, int col_st, int col_end)
+{
+  Matrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
+
+  for(int i = row_st; i<row_end; i++){
+  for(int j = col_st; j<col_end; j++){
+    H[i-row_st][j-col_st]=A[i][j];
+  }}
+  return H;
+}
+  
+/** Compute sum_ij B[i] * A[i][j] * B[j]. Neither factor of B is conjugated.
+    Asserts that the length of B matches the dimension of the square matrix A. **/
+template<class T> T proj(Matrix<T> A, Vector<T> B)
+{
+  int dim; SizeSquare(A,dim);
+  int dimB; Size(B,dimB);
+  assert(dimB==dim);
+
+  T total = 0;
+  for(int row=0; row<dim; ++row){
+    T rowdot = 0.0;
+    for(int col=0; col<dim; ++col){
+      rowdot += A[row][col]*B[col];
+    }
+    total +=  B[row]*rowdot;
+  }
+  return total;
+}
+
+
+/*
+ *************************************************************
+ *
+ * Matrix Vector products
+ *
+ *************************************************************
+ */
+// Instead make a linop and call my CG;
+
+/// q -> q Q on the leading N vectors:
+///   q[j] <- sum_{k<N} q[k] * Q[k][j]   for 0 <= j < N.
+/// Requires N <= dim(Q) and N <= q.size(); entries q[N..] are left untouched.
+/// Declared ahead of the two-argument overload, which calls it.
+/// Fermion is a template parameter; the grid is taken from q[0]._grid.
+template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q, int N)
+{
+  int M; SizeSquare(Q,M);
+  int K; Size(q,K);
+  assert(N<=M);
+  assert(N<=K);
+  if ( N<=0 ) return;              // nothing to rotate; also avoids touching q[0]
+
+  GridBase *grid = q[0]._grid;
+  Vector<Fermion> S(N,grid );      // results are staged so every sum reads the old q
+  for(int j=0;j<N;j++){
+    S[j] = zero;
+    for(int k=0;k<N;k++){
+      S[j] = S[j] +  q[k]* Q[k][j];
+    }
+  }
+  for(int j=0;j<N;j++){            // S has exactly N entries
+    q[j] = S[j];
+  }
+}
+
+/// q -> q Q, for q and Q of matching dimension
+template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q)
+{
+  int M; SizeSquare(Q,M);
+  int N; Size(q,N);
+  assert(M==N);
+
+  times(q,Q,N);
+}
+#endif
--- a/lib/algorithms/iterative/bisec.c
+++ b/lib/algorithms/iterative/bisec.c
@@ -0,0 +1,122 @@
+#include <math.h>
+#include <stdlib.h>
+#include <utility>
+#include <vector>
+
+/**
+ * Bisection eigenvalue finder for a symmetric tridiagonal matrix stored as a
+ * diagonal array and an off-diagonal array, both indexed from 1.
+ */
+struct Bisection {
+
+/**
+ * Run bisec() twice (tolerances 1e-10 and 1e-16) over eigenvalue indices
+ * 1..row_num and sort the second result set into ascending magnitude with a
+ * bidirectional bubble sort.
+ *
+ * NOTE(review): this body still reads a matrix `A`, writes a matrix `H` and
+ * calls `mag`, none of which are declared in this file, and it never writes
+ * the `eig` output. It also overwrites the ALPHA/BETA inputs from `A`. This
+ * looks like an unfinished port; the intended data flow needs confirming
+ * before the routine can be used.
+ */
+static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig)
+{
+  int i,j;
+  std::vector<RealD> evec1(row_num+3);
+  std::vector<RealD> evec2(row_num+3);
+  RealD eps2;
+  ALPHA[1]=0.;
+  BETA[1]=0.;
+  for(i=0;i<row_num-1;i++) {
+    ALPHA[i+1] = A[i*(row_num+1)].real();
+    BETA[i+2] = A[i*(row_num+1)+1].real();
+  }
+  ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real();
+  bisec(ALPHA,BETA,row_num,1,row_num,1e-10,1e-10,evec1,eps2);
+  bisec(ALPHA,BETA,row_num,1,row_num,1e-16,1e-16,evec2,eps2);
+
+  // Do we really need to sort here?
+  int begin=1;
+  int end = row_num;
+  int swapped=1;
+  while(swapped) {
+    swapped=0;
+    for(i=begin;i<end;i++){
+      if(mag(evec2[i])>mag(evec2[i+1]))	{
+	std::swap(evec2[i],evec2[i+1]);   // swap the elements, not iterators
+	swapped=1;
+      }
+    }
+    end--;
+    for(i=end-1;i>=begin;i--){
+      if(mag(evec2[i])>mag(evec2[i+1]))	{
+	std::swap(evec2[i],evec2[i+1]);
+	swapped=1;
+      }
+    }
+    begin++;
+  }
+
+  for(i=0;i<row_num;i++){
+    for(j=0;j<row_num;j++) {
+      if(i==j) H[i*row_num+j]=evec2[i+1];
+      else H[i*row_num+j]=0.;
+    }
+  }
+}
+
+/**
+ * Bisection on the Sturm-type recurrence q_i = c[i] - x - b[i]^2/q_{i-1}.
+ *
+ * c       diagonal entries, read at indices 1..n
+ * b       off-diagonal entries, read at indices 1..n; b[1] is overwritten with 0
+ * n       matrix dimension
+ * m1,m2   inclusive range of eigenvalue indices to compute
+ * eps1    absolute tolerance; if <= 0 it is replaced by the computed eps2
+ * relfeh  relative precision used in the convergence test
+ * x       output: x[m1..m2] receive the eigenvalue approximations
+ * eps2    output: 0.5*eps1 + 7*relfeh*max(|xmin|,|xmax|)-style error bound
+ *
+ * The search interval [xmin,xmax] comes from row-sum bounds on c and b; note
+ * that xmax is additionally doubled before use.
+ */
+static void bisec(std::vector<RealD> &c,
+		  std::vector<RealD> &b,
+		  int n,
+		  int m1,
+		  int m2,
+		  RealD eps1,
+		  RealD relfeh,
+		  std::vector<RealD> &x,
+		  RealD &eps2)
+{
+  std::vector<RealD> wu(n+2);
+
+  RealD h,q,x1,xu,x0,xmin,xmax;
+  int i,a,k;
+
+  b[1]=0.0;
+  xmin=c[n]-fabs(b[n]);
+  xmax=c[n]+fabs(b[n]);
+  for(i=1;i<n;i++){
+    h=fabs(b[i])+fabs(b[i+1]);
+    if(c[i]+h>xmax) xmax= c[i]+h;
+    if(c[i]-h<xmin) xmin= c[i]-h;
+  }
+  xmax *=2.;
+
+  eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin);
+  if(eps1<=0.0) eps1=eps2;
+  eps2=0.5*eps1+7.0*(eps2);
+  x0=xmax;
+  for(i=m1;i<=m2;i++){
+    x[i]=xmax;
+    wu[i]=xmin;
+  }
+
+  for(k=m2;k>=m1;k--){
+    xu=xmin;
+    i=k;
+    // Walk i from k down to m1; setting i=m1-1 acts as a "break" once a
+    // larger stored lower bound wu[i] has been picked up.
+    do{
+      if(xu<wu[i]){
+	xu=wu[i];
+	i=m1-1;
+      }
+      i--;
+    }while(i>=m1);
+    if(x0>x[k]) x0=x[k];
+    while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){
+      x1=(xu+x0)/2;
+
+      // a counts the negative terms of the recurrence at the trial value x1
+      a=0;
+      q=1.0;
+      for(i=1;i<=n;i++){
+	q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh);
+	if(q<0) a++;
+      }
+      //			printf("x1=%e a=%d\n",x1,a);
+      if(a<k){
+	if(a<m1){
+	  xu=x1;
+	  wu[m1]=x1;
+	}else {
+	  xu=x1;
+	  wu[a+1]=x1;
+	  if(x[a]>x1) x[a]=x1;
+	}
+      }else x0=x1;
+    }
+    x[k]=(x0+xu)/2;
+  }
+}
+};
--- a/lib/algorithms/iterative/get_eig.c
+++ b/lib/algorithms/iterative/get_eig.c
@@ -0,0 +1 @@
+
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -29,14 +29,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-PARALLEL_NESTED_LOOP2
+  int bo=0;
+    //PARALLEL_NESTED_LOOP21
  for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  = n*rhs._grid->_slice_stride[dimension];
-      int bo = n*rhs._grid->_slice_block[dimension];
+      //      int bo = n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb &cbmask ) {
-	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
      }
    }
  }
@@ -59,7 +60,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_

  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-PARALLEL_NESTED_LOOP2
+  //PARALLEL_NESTED_LOOP2
  for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){

@@ -109,14 +110,15 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-PARALLEL_NESTED_LOOP2
+  int bo=0;
+  //PARALLEL_NESTED_LOOP2
  for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o   =n*rhs._grid->_slice_stride[dimension];
-      int bo  =n*rhs._grid->_slice_block[dimension];
+      //      int bo  =n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb & cbmask ) {
-	rhs._odata[so+o+b]=buffer[bo+b];
+	rhs._odata[so+o+b]=buffer[bo++];
      }
    }
  }
--- a/lib/cshift/Cshift_mpi.h
+++ b/lib/cshift/Cshift_mpi.h
@@ -9,7 +9,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

-  Lattice<vobj> ret(rhs._grid);
+  Lattice<vobj> ret(rhs._grid); 
  
  int fd = rhs._grid->_fdimensions[dimension];
  int rd = rhs._grid->_rdimensions[dimension];
@@ -26,10 +26,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension


  if ( !comm_dim ) {
+    //    std::cout << "Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
+    //    std::cout << "Cshift_comms_simd" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
+    //    std::cout << "Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  return ret;
@@ -42,9 +45,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

+  //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+
  if ( sshift[0] == sshift[1] ) {
+    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x3);
  } else {
+    //    std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
@@ -113,12 +120,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

+
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);

+      //      for(int i=0;i<words;i++){
+      //	std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl;
+      //      }
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
    }
  }
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -212,9 +212,10 @@ PARALLEL_FOR_LOOP
    // Constructor requires "grid" passed.
    // what about a default grid?
    //////////////////////////////////////////////////////////////////
- Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
+    Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
      //        _odata.reserve(_grid->oSites());
      //        _odata.resize(_grid->oSites());
+    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
        assert((((uint64_t)&_odata[0])&0xF) ==0);
        checkerboard=0;
    }
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -1,11 +1,10 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H

+
 #ifdef HAVE_ENDIAN_H
 #include <endian.h>
 #endif
-
-
 #include <arpa/inet.h>
 #include <algorithm>
 // 64bit endian swap is a portability pain
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -78,7 +78,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
  accumReconXm(result,Uchi);
-  
+
  // Ym
  SE=st.GetEntry(ptype,Ym,sF);
  if (  SE->_is_local && SE->_permute ) {
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -8,6 +8,9 @@
 //----------------------------------------------------------------------

 #include <immintrin.h>
+#ifdef AVXFMA4
+#include <x86intrin.h>
+#endif
 // _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
 #ifndef _mm256_set_m128i
 #define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
@@ -132,7 +135,7 @@ namespace Optimization {
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) 
+#if defined (AVX1) || defined (AVXFMA4)
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
@@ -146,7 +149,6 @@ namespace Optimization {
 #if defined (AVX2)
            return _mm256_add_epi32(a,b);
 #endif
-
    }
  };

@@ -161,7 +163,7 @@ namespace Optimization {
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) 
+#if defined (AVX1) || defined (AVXFMA4)
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
@@ -182,6 +184,7 @@ namespace Optimization {
  struct MultComplex{
    // Complex float
    inline __m256 operator()(__m256 a, __m256 b){
+#if defined (AVX1) 
      __m256 ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
@@ -190,6 +193,20 @@ namespace Optimization {
      ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
      ymm1 = _mm256_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
      return _mm256_addsub_ps(ymm0,ymm1);  
+#endif
+#if defined (AVXFMA4)
+      __m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar,
+      __m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai
+      __m256 tmp = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1));
+      a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
+      return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
+#endif
+#if defined (AVX2)
+      __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
+      __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
+      a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
+      return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
+#endif
    }
    // Complex double
    inline __m256d operator()(__m256d a, __m256d b){
@@ -215,6 +232,7 @@ namespace Optimization {
 	IF IMM0[3] = 0
 	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
      */
+#if defined (AVX1) 
      __m256d ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
      ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
@@ -222,10 +240,71 @@ namespace Optimization {
      ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai  b'11,11
      ymm1 = _mm256_mul_pd(ymm1,ymm2);   // ymm1 <- br ai, ai bi
      return _mm256_addsub_pd(ymm0,ymm1);
+#endif
+#if defined (AVXFMA4)
+      __m256d a_real = _mm256_shuffle_pd(a,a,0x0);//arar
+      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
+      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
+      return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
+#endif
+#if defined (AVX2)
+      __m256d a_real = _mm256_moveldup_pd( a ); // Ar Ar
+      __m256d a_imag = _mm256_movehdup_pd( a ); // Ai Ai
+      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
+      return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
+#endif
    }
+
+
  };

+#if 0
+  struct ComplexDot {
+
+    inline void Prep(__m256 ari,__m256 &air) {
+      cdotRIperm(ari,air);
+    }
+    inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
+      riir=air*b;
+      iirr=arr*b;
+    };
+    inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
+      mac(riir,air,b);
+      mac(iirr,ari,b);
+    }
+    inline void End(__m256 ari,__m256 &air) {
+      //      cdotRI
+    }
+
+  };
+#endif
+
  struct Mult{
+
+    // Multiply-accumulate on packed single precision: a <- b*c + a.
+    // The instruction sequence is selected by the AVX1 / AVXFMA4 / AVX2 macros.
+    // NOTE(review): the three #if blocks are independent, so if none of the
+    // macros is defined a is left unchanged, and if more than one is defined
+    // the accumulation is applied more than once.
+    inline void mac(__m256 &a, __m256 b, __m256 c){
+#if defined (AVX1)
+      a= _mm256_add_ps(_mm256_mul_ps(b,c),a);
+#endif
+#if defined (AVXFMA4)
+      a= _mm256_macc_ps(b,c,a);
+#endif
+#if defined (AVX2)
+      a= _mm256_fmadd_ps( b, c, a);
+#endif
+    }
+
+    // Multiply-accumulate on packed double precision: a <- b*c + a.
+    // The instruction sequence is selected by the AVX1 / AVXFMA4 / AVX2 macros.
+    // NOTE(review): the three #if blocks are independent, so if none of the
+    // macros is defined a is left unchanged, and if more than one is defined
+    // the accumulation is applied more than once.
+    inline void mac(__m256d &a, __m256d b, __m256d c){
+#if defined (AVX1)
+      a= _mm256_add_pd(_mm256_mul_pd(b,c),a);
+#endif
+#if defined (AVXFMA4)
+      a= _mm256_macc_pd(b,c,a);
+#endif
+#if defined (AVX2)
+      a= _mm256_fmadd_pd( b, c, a);
+#endif
+    }
+
    // Real float
    inline __m256 operator()(__m256 a, __m256 b){
      return _mm256_mul_ps(a,b);
--- a/lib/simd/Grid_empty.h
+++ b/lib/simd/Grid_empty.h
@@ -157,6 +157,12 @@ namespace Optimization {
  };

  struct Mult{
+    // Placeholder multiply-accumulate: ignores its arguments and returns 0.
+    // NOTE(review): the third parameter is double while the other two are
+    // float, and the SSE/AVX versions return void and update a by reference;
+    // confirm the intended signature.
+    inline float  mac(float a, float b,double c){
+      return 0;
+    }
+    // Placeholder multiply-accumulate: ignores its arguments and returns 0.
+    // NOTE(review): the SSE/AVX versions return void and update a by
+    // reference; confirm the intended signature.
+    inline double mac(double a, double b,double c){
+      return 0;
+    }
    // Real float
    inline float operator()(float a, float b){
      return 0;
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -171,6 +171,12 @@ namespace Optimization {

  struct Mult{
    // Real float
+    // Multiply-accumulate on packed single precision: returns b*c + a.
+    // NOTE(review): unlike the SSE/AVX versions, which update a by reference
+    // and return void, this one returns the result by value.
+    inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
+      return vaddq_f32(vmulq_f32(b,c),a);
+    }
+    // Multiply-accumulate on packed double precision: returns b*c + a.
+    // NOTE(review): unlike the SSE/AVX versions, which update a by reference
+    // and return void, this one returns the result by value.
+    inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
+      return vaddq_f64(vmulq_f64(b,c),a);
+    }
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
      return vmulq_f32(a,b);
    }
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -171,6 +171,15 @@ namespace Optimization {
  };

  struct Mult{
+
+    // Multiply-accumulate on packed single precision: a <- b*c + a.
+    // 128-bit SSE intrinsics use the plain _mm_ prefix.
+    inline void mac(__m128 &a, __m128 b, __m128 c){
+      a= _mm_add_ps(_mm_mul_ps(b,c),a);
+    }
+
+    // Multiply-accumulate on packed double precision: a <- b*c + a.
+    // 128-bit SSE2 intrinsics use the plain _mm_ prefix.
+    inline void mac(__m128d &a, __m128d b, __m128d c){
+      a= _mm_add_pd(_mm_mul_pd(b,c),a);
+    }
+
    // Real float
    inline __m128 operator()(__m128 a, __m128 b){
      return _mm_mul_ps(a,b);
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -13,7 +13,7 @@
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined (AVX1)|| defined (AVX2)
+#if defined (AVX1)|| defined (AVX2) || defined (AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
@@ -133,7 +133,11 @@ namespace Grid {
    ///////////////////////////////////////////////
    // mac, mult, sub, add, adj
    ///////////////////////////////////////////////
+
+    // FIXME -- alias this to an inline MAC struct.
    friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+
+
    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
--- a/lib/stencil/Stencil_common.cc
+++ b/lib/stencil/Stencil_common.cc
@@ -8,7 +8,7 @@ namespace Grid {
 				     int checkerboard,
 				     const std::vector<int> &directions,
 				     const std::vector<int> &distances) 
-    :   _entries(npoints), _permute_type(npoints)
+    :   _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
    {
      _npoints = npoints;
      _grid    = grid;
@@ -61,11 +61,17 @@ namespace Grid {
 	  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 	  if ( sshift[0] == sshift[1] ) {
 	    Comms(point,dimension,shift,0x3);
+	    //	    std::cout<<"Comms 0x3"<<std::endl;
 	  } else {
 	    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 	    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
+	    //	    std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
 	  }
 	}
+	//	for(int ss=0;ss<osites;ss++){
+	  //	  std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
+	  //	    _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
+	//	}
      }
    }

@@ -139,13 +145,14 @@ namespace Grid {
      int cb= (cbmask==0x2)? Odd : Even;
      int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
      
-      for(int x=0;x<rd;x++){       
-	
-	int offnode = (((x+sshift)%fd) >= rd ); 
-	//	int comm_proc   = ((x+sshift)/ld)%pd;        
-	//	int offnode     = (comm_proc!=0);
-	int sx          = (x+sshift)%rd;

+      for(int x=0;x<rd;x++){       
+
+	int sx        =  (x+sshift)%rd;
+	int comm_proc = ((x+sshift)/rd)%pd;
+    	int offnode = (comm_proc!= 0);
+
+	//	std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
 	int wraparound=0;
 	if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
 	  wraparound = 1;
@@ -249,7 +256,7 @@ namespace Grid {
 	int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
 	int o   = 0;                                      // relative offset to base within plane
 	int bo  = 0;                                      // offset in buffer
-    
+
 	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 	  for(int b=0;b<_grid->_slice_block[dimension];b++){

--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,5 @@

-bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
+bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi


 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
@@ -86,16 +86,16 @@ Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
 Test_dwf_hdcr_LDADD=-lGrid


-#Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
-#Test_dwf_lanczos_LDADD=-lGrid
+Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
+Test_dwf_lanczos_LDADD=-lGrid


 Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid


-#Test_gparity_SOURCES=Test_gparity.cc
-#Test_gparity_LDADD=-lGrid
+Test_gparity_SOURCES=Test_gparity.cc
+Test_gparity_LDADD=-lGrid


 Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
@@ -190,6 +190,10 @@ Test_stencil_SOURCES=Test_stencil.cc
 Test_stencil_LDADD=-lGrid


+Test_synthetic_lanczos_SOURCES=Test_synthetic_lanczos.cc
+Test_synthetic_lanczos_LDADD=-lGrid
+
+
 Test_wilson_cg_prec_SOURCES=Test_wilson_cg_prec.cc
 Test_wilson_cg_prec_LDADD=-lGrid

--- a/tests/Test_cshift_red_black.cc
+++ b/tests/Test_cshift_red_black.cc
@@ -54,27 +54,27 @@ int main (int argc, char ** argv)

  TComplex cm;
  for(int dir=0;dir<Nd;dir++){
-    if ( dir!=1 ) continue;
+    //    if ( dir!=1 ) continue;
    for(int shift=0;shift<latt_size[dir];shift++){

 	std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;

-	//	std::cout<<GridLogMessage<<"Even grid"<<std::endl;
+	std::cout<<GridLogMessage<<"Even grid"<<std::endl;
 	ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
-	//	std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
+	std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;

-	//	std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
+	std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
 	ShiftUo = Cshift(Uo,dir,shift);    
-	//	std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
+	std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;

-	//	std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
+	std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
 	setCheckerboard(rbShiftU,ShiftUe);
 	setCheckerboard(rbShiftU,ShiftUo);
-	//	std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
+	std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;

-	//	std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
+	std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
 	ShiftU  = Cshift(U,dir,shift);    // Shift everything
-	//	std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;
+	std::cout<<GridLogMessage << "\tShiftU " <<norm2(ShiftU)<<std::endl; // report the full-grid shift just computed (was rbShiftU)

 	std::vector<int> coor(4);

@@ -105,18 +105,18 @@ int main (int argc, char ** argv)
 	  Fine.CoorFromIndex(peer,index,latt_size);

 	  if (nrm > 0){
-	    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir
+	    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
-	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 	    index=real(scm);
 	    Fine.CoorFromIndex(peer,index,latt_size);
-	    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 	    exit(-1);
 	  }
 	}}}}

-
+	int exx=0;
 	std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
 	for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 	for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
@@ -144,20 +144,21 @@ int main (int argc, char ** argv)
 	  Fine.CoorFromIndex(peer,index,latt_size);

 	  if (nrm > 0){
-	    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir
+	    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
-	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 	    index=real(scm);
 	    Fine.CoorFromIndex(peer,index,latt_size);
-	    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
-	    exit(-1);
-	  } else if (0) { 
+	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+	    exx=1;
+	  } else if (1) { 
 	    std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
 		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 	  }
 	}}}}
+	if (exx) exit(-1);

    }
  }
--- a/tests/Test_dwf_lanczos.cc
+++ b/tests/Test_dwf_lanczos.cc
@@ -35,21 +35,26 @@ int main (int argc, char ** argv)

  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);

-  const int Nk = 10;
-  const int Np = 1;
-  RealD enorm  = 1.0;
-  RealD vthrs  = 1;
-  const int Nit= 1000;
+  const int Nk = 30;
+  const int Np = 10;
+  const int Nm = Nk+Np;
+  const int MaxIt= 10000;
+  RealD resid = 1.0e-8;

-  ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,
-						 Nk,Np,enorm,vthrs,Nit);
+  std::vector<double> Coeffs(1,1.0);
+  Polynomial<LatticeFermion> PolyX(Coeffs);
+  ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,Nk,Nm,resid,MaxIt);

  
-  std::vector<RealD>          eval(Nk);
-  std::vector<LatticeFermion> evec(Nk,FGrid);
+  std::vector<RealD>          eval(Nm);
+  std::vector<LatticeFermion> evec(Nm,FGrid);
+  for(int i=0;i<Nm;i++){
+    std::cout << i<<" / "<< Nm<< " grid pointer "<<evec[i]._grid<<std::endl;
+  };
+
+  int Nconv;
  IRL.calc(eval,evec,
 	   src,
-	   Nsbt,
 	   Nconv);


--- a/tests/Test_main.cc
+++ b/tests/Test_main.cc
@@ -298,7 +298,7 @@ int main (int argc, char ** argv)
      c          = scm()(1,1)(1,2);
      scm()(1,1)(2,1) = c;

-      pokeIndex<ColourIndex> (c_m,c,0,0);
+      //      pokeIndex<ColourIndex> (c_m,c,0,0);
    }

    FooBar = Bar;
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -8,6 +8,10 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

+  //  typedef LatticeColourMatrix Field;
+  typedef LatticeComplex Field;
+  typedef typename Field::vector_object vobj;
+  typedef typename vobj::scalar_object sobj;

  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
@@ -18,23 +22,40 @@ int main (int argc, char ** argv)
  GridCartesian Fine(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
  GridParallelRNG       fRNG(&Fine);
+
  //  fRNG.SeedRandomDevice();
  std::vector<int> seeds({1,2,3,4});
  fRNG.SeedFixedIntegers(seeds);
  
-  LatticeColourMatrix Foo(&Fine);
-  LatticeColourMatrix Bar(&Fine);
-  LatticeColourMatrix Check(&Fine);
-  LatticeColourMatrix Diff(&Fine);
-  
+  Field Foo(&Fine);
+  Field Bar(&Fine);
+  Field Check(&Fine);
+  Field Diff(&Fine);
+  LatticeComplex lex(&Fine);
+
+  lex = zero;  
  random(fRNG,Foo);
  gaussian(fRNG,Bar);

+  /*
+  Integer stride =1000;
+  {
+    double nrm;
+    LatticeComplex coor(&Fine);
+
+    for(int d=0;d<Nd;d++){
+      LatticeCoordinate(coor,d);
+      lex = lex + coor*stride;
+      stride=stride/10;
+    }
+    Foo=lex;
+  }
+  */

    for(int dir=0;dir<4;dir++){
      for(int disp=0;disp<Fine._fdimensions[dir];disp++){

-	std::cout<<GridLogMessage << "Using stencil to shift dim "<<dir<< " by "<<disp<<std::endl;
+	std::cout<< std::fixed <<GridLogMessage << "Using stencil to shift dim "<<dir<< " by "<<disp<<std::endl;
 	// start to test the Cartesian npoint stencil infrastructure
 	int npoint=1;
 	std::vector<int> directions(npoint,dir);
@@ -48,8 +69,8 @@ int main (int argc, char ** argv)
 	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 	}
 	
-	std::vector<vColourMatrix,alignedAllocator<vColourMatrix> >  comm_buf(myStencil._unified_buffer_size);
-	SimpleCompressor<vColourMatrix> compress;
+	std::vector<vobj,alignedAllocator<vobj> >  comm_buf(myStencil._unified_buffer_size);
+	SimpleCompressor<vobj> compress;
 	myStencil.HaloExchange(Foo,comm_buf,compress);

 	Bar = Cshift(Foo,dir,disp);
@@ -75,9 +96,114 @@ int main (int argc, char ** argv)
 	Real nrm  = norm2(Diff);
 	std::cout<<GridLogMessage<<"N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;

-	Real snrmC =0;
-	Real snrmB =0;
-	Real snrm  =0;
+	std::vector<int> coor(4);
+	for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
+	for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
+	for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
+	for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
+
+	  RealD diff;
+	  sobj check,bar;
+	  peekSite(check,Check,coor);
+	  peekSite(bar,Bar,coor);
+
+	  sobj ddiff;
+	  ddiff = check -bar;
+	  diff =norm2(ddiff);
+	  if ( diff > 0){
+	    std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
+		      <<") " <<check<<" vs "<<bar<<std::endl;
+	  }
+
+	 
+	}}}}
+
+
+
+      }
+    }
+
+    std::cout<<GridLogMessage<<"Testing RedBlack\n ";
+
+
+  Field EFoo(&rbFine);
+  Field OFoo(&rbFine);
+  Field ECheck(&rbFine);
+  Field OCheck(&rbFine);
+  pickCheckerboard(Even,EFoo,Foo);
+  pickCheckerboard(Odd ,OFoo,Foo);
+
+    for(int dir=0;dir<4;dir++){
+      for(int disp=0;disp<rbFine._fdimensions[dir];disp++){
+
+	std::cout<<GridLogMessage << "Using stencil to shift rb dim "<<dir<< " by "<<disp<<std::endl;
+	// start to test the Cartesian npoint stencil infrastructure
+	int npoint=1;
+	std::vector<int> directions(npoint,dir);
+	std::vector<int> displacements(npoint,disp);
+
+	CartesianStencil EStencil(&rbFine,npoint,Even,directions,displacements);
+	CartesianStencil OStencil(&rbFine,npoint,Odd,directions,displacements);
+
+	std::vector<int> ocoor(4);
+	for(int o=0;o<Fine.oSites();o++){
+	  Fine.oCoorFromOindex(ocoor,o);
+	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
+	}
+	
+	std::vector<vobj,alignedAllocator<vobj> >  Ecomm_buf(EStencil._unified_buffer_size);
+	std::vector<vobj,alignedAllocator<vobj> >  Ocomm_buf(OStencil._unified_buffer_size);
+
+	SimpleCompressor<vobj> compress;
+
+	EStencil.HaloExchange(EFoo,Ecomm_buf,compress);
+	OStencil.HaloExchange(OFoo,Ocomm_buf,compress);
+	
+	Bar = Cshift(Foo,dir,disp);
+
+	if ( disp & 0x1 ) {
+	  ECheck.checkerboard = Even;
+	  OCheck.checkerboard = Odd;
+	} else { 
+	  ECheck.checkerboard = Odd;
+	  OCheck.checkerboard = Even;
+	}
+	// Implement a stencil code that should agree with that darn cshift!
+	for(int i=0;i<OCheck._grid->oSites();i++){
+	  int permute_type;
+	  StencilEntry *SE;
+	  SE = EStencil.GetEntry(permute_type,0,i);
+	  std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
+
+	  if ( SE->_is_local && SE->_permute )
+	    permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type);
+	  else if (SE->_is_local)
+	    OCheck._odata[i] = EFoo._odata[SE->_offset];
+	  else 
+	    OCheck._odata[i] = Ecomm_buf[SE->_offset];
+	}
+	for(int i=0;i<ECheck._grid->oSites();i++){
+	  int permute_type;
+	  StencilEntry *SE;
+	  SE = OStencil.GetEntry(permute_type,0,i);
+	  std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
+	  
+	  if ( SE->_is_local && SE->_permute )
+	    permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type);
+	  else if (SE->_is_local)
+	    ECheck._odata[i] = OFoo._odata[SE->_offset];
+	  else 
+	    ECheck._odata[i] = Ocomm_buf[SE->_offset];
+	}
+	
+	setCheckerboard(Check,ECheck);
+	setCheckerboard(Check,OCheck);
+	
+	Real nrmC = norm2(Check);
+	Real nrmB = norm2(Bar);
+	Diff = Check-Bar;
+	Real nrm  = norm2(Diff);
+	std::cout<<GridLogMessage<<"RB N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;

 	std::vector<int> coor(4);
 	for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
@@ -85,33 +211,22 @@ int main (int argc, char ** argv)
 	for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
 	for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){

-	  Complex diff;
-	  ColourMatrix check,bar;
+	  RealD diff;
+	  sobj check,bar;
 	  peekSite(check,Check,coor);
 	  peekSite(bar,Bar,coor);

-	  for(int r=0;r<3;r++){
-	  for(int c=0;c<3;c++){
-            diff =check()()(r,c)-bar()()(r,c);
-            double nn=real(conjugate(diff)*diff);
-            if ( nn > 0){
-	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
-		     coor[0],coor[1],coor[2],coor[3],r,c,
-		     nn,
-		     real(check()()(r,c)),
-		     imag(check()()(r,c)),
-		     real(bar()()(r,c))
-		     );
-	    }
-	    snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c));
-	    snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c));
-	    snrm=snrm+nn;
-	  }}
+	  sobj ddiff;
+	  ddiff = check -bar;
+	  diff =norm2(ddiff);
+	  if ( diff > 0){
+	    std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3] <<") "
+		      <<"shift "<<disp<<" dir "<< dir 
+		      << "  stencil impl " <<check<<" vs cshift impl "<<bar<<std::endl;
+	  }
 	 
 	}}}}

-	std::cout<<GridLogMessage<<"scalar N2diff = "<<snrm<<" " <<snrmC<<" "<<snrmB<<std::endl;
-

      }
    }
--- a/tests/Test_synthetic_lanczos.cc
+++ b/tests/Test_synthetic_lanczos.cc
@@ -0,0 +1,123 @@
+#include <fenv.h>
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+// Unmask the FP exceptions in 'excepts'; returns the previous state (old mask bits on Apple, enabled set on glibc) or -1.
+static int FEenableexcept (unsigned int excepts) {
+#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
+  static fenv_t fenv;
+  const unsigned int new_excepts = excepts & FE_ALL_EXCEPT;
+  if ( fegetenv (&fenv) ) return -1;
+  const unsigned int old_excepts = fenv.__control & FE_ALL_EXCEPT; // previous masks
+  fenv.__control &= ~new_excepts;        // unmask in the x87 control word
+  fenv.__mxcsr   &= ~(new_excepts << 7); // unmask in MXCSR (mask bits sit 7 above the flag bits)
+  return ( fesetenv (&fenv) ? -1 : static_cast<int>(old_excepts) );
+#elif defined(__GLIBC__)
+  return feenableexcept (static_cast<int>(excepts & FE_ALL_EXCEPT)); // glibc extension; its fenv_t has no __control member
+#else
+  (void)excepts; return -1;              // no known way to unmask traps here: report failure
+#endif
+}
+
+
+// Test operator whose action is out = scale * in, with 'scale' a lattice field fixed at construction.
+template<class Field> class DumbOperator  : public LinearOperatorBase<Field> {
+public:
+  LatticeComplex scale; // multiplier applied by Op, AdjOp and HermOpAndNorm
+
+  // Draw 'scale' from an RNG seeded with fixed integers, then replace it by exp(-real(scale)*6.0).
+  DumbOperator(GridBase *grid)    : scale(grid)
+  {
+    GridParallelRNG  rng(grid);
+    std::vector<int> fixed_seeds({5,6,7,8});
+    rng.SeedFixedIntegers(fixed_seeds);
+    random(rng,scale);
+    scale = exp(-real(scale)*6.0);
+    std::cout << " True matrix \n"<< scale <<std::endl;
+  }
+
+  // Support for coarsening to a multigrid: both hooks are empty in this test operator.
+  void OpDiag (const Field &in, Field &out) {}
+  void OpDir  (const Field &in, Field &out,int dir,int disp){}
+
+  // out = scale * in
+  void Op     (const Field &in, Field &out){ out = scale * in; }
+
+  // Performs exactly the same product as Op.
+  void AdjOp  (const Field &in, Field &out){ out = scale * in; }
+
+  // Applies the operator via HermOpAndNorm and discards the two values it reports.
+  void HermOp(const Field &in, Field &out){
+    double unused_n1, unused_n2;
+    HermOpAndNorm(in,out,unused_n1,unused_n2);
+  }
+
+  // out = scale * in; n1 = real(innerProduct(in,out)); n2 = real(innerProduct(out,out)).
+  void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+    out = scale * in;
+
+    const ComplexD in_dot_out  = innerProduct(in,out);
+    n1 = real(in_dot_out);
+    const ComplexD out_dot_out = innerProduct(out,out);
+    n2 = real(out_dot_out);
+  }
+};
+
+
+// Test driver: runs ImplicitlyRestartedLanczos on a DumbOperator, first built with X, then with Cheby.
+int main (int argc, char ** argv)
+{
+  // Unmask every floating-point exception except FE_INEXACT before Grid starts up.
+  FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT);
+  Grid_init(&argc,&argv);
+
+  GridCartesian *grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+                                                       GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                       GridDefaultMpi());
+
+  // Parallel RNG seeded with fixed integers; used further down to draw the source field.
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG  RNG(grid);
+  RNG.SeedFixedIntegers(seeds);
+
+  // Values handed unchanged to the ChebyshevLanczos constructor.
+  RealD cheby_alpha = 1.0;
+  RealD cheby_beta  = 0.03;
+  RealD cheby_mu    = 0.0;
+  int   cheby_order = 11;
+  ChebyshevLanczos<LatticeComplex> Cheby(cheby_alpha,cheby_beta,cheby_mu,cheby_order);
+
+  // Pass an output stream on pooh.dat to Cheby's csv() method.
+  std::ofstream csv_out("pooh.dat");
+  Cheby.csv(csv_out);
+
+  HermOpOperatorFunction<LatticeComplex> X;
+  DumbOperator<LatticeComplex> HermOp(grid);
+
+  // Settings shared by both ImplicitlyRestartedLanczos instances.
+  const int Nk = 40;
+  const int Nm = 80;
+  const int Nit= 10000;
+  RealD eresid = 1.0e-8;
+  int Nconv;
+
+  ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nm,eresid,Nit);
+  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit);
+
+  LatticeComplex src(grid);
+  gaussian(RNG,src);
+  { // first pass: the instance built with X
+    std::vector<RealD>          eval(Nm);
+    std::vector<LatticeComplex> evec(Nm,grid);
+    IRL.calc(eval,evec,src, Nconv);
+  }
+  { // second pass: the instance built with Cheby, with its own eval/evec storage
+    std::vector<RealD>          eval(Nm);
+    std::vector<LatticeComplex> evec(Nm,grid);
+    ChebyIRL.calc(eval,evec,src, Nconv);
+  }
+
+  Grid_finalize();
+}