mirror of https://github.com/paboyle/Grid.git synced 2025-06-15 14:27:06 +01:00

Compare commits


166 Commits

SHA1 Message Date
4ac85b3e8f Made checkerboard choice in staggered preconditioned solvers switchable 2018-02-13 11:49:02 -05:00
dd8cd8a8e8 Re-adding staggered operators and Schur solvers, consistent with CPS & MILC preconditioning 2018-02-11 02:46:50 -05:00
0cccb3dc82 Re-added Stag preconditioned operator and preconditioned solvers, as the current versions are not consistent with CPS or MILC conventions. 2018-02-11 02:39:34 -05:00
5d44346be3 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/Lanczos 2017-10-30 15:49:17 -04:00
3a754fcd51 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/Lanczos 2017-10-27 17:34:35 -04:00
1ef424b139 Split grid Y2K bug fix attempt 2017-10-27 14:20:35 +01:00
034de160bf Staggered updates : Schur fixed and added a unit test for Test_staggered_cg_schur.cc giving stronger check 2017-10-26 20:58:46 +01:00
f9e28577f3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-25 21:07:56 +01:00
8a3aae98f6 Solving minor bug in compilation 2017-10-25 10:34:49 +01:00
8309f2364b Solving again the MPI comm bug with FFTs 2017-10-25 10:24:14 +01:00
cac1750078 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-24 23:30:36 +01:00
137886c316 Addressed when coor.size() != dim.size() in Lexicographic 2017-10-19 10:28:57 -04:00
27936900e6 Putting the FG verbosity in the Integrator level 2017-10-18 13:08:09 +01:00
9fe6ac71ea Starting reorg of Blocked lanczos 2017-10-11 10:12:07 +01:00
ef61b549e6 Merge branch 'feature/Lanczos' into ckelly_develop4 2017-10-10 13:41:43 -04:00
f1fa00b71b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 14:26:44 +01:00
bf58557fb1 Block compressed Lanczos 2017-10-10 14:15:11 +01:00
10cb37f504 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 14:09:44 +01:00
1374c943d4 Correct Schur operator called 2017-10-10 13:59:50 +01:00
a1d80282ec cb factorise 2017-10-10 13:49:31 +01:00
4eb8bbbebe Christoph mods 2017-10-10 13:48:51 +01:00
d1c6288c5f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 13:38:40 +01:00
dd949bc428 Merge branch 'feature/staggering' into develop 2017-10-10 13:02:51 +01:00
bb7378cfc3 Schur for staggered 2017-10-10 12:02:18 +01:00
f0e084a88c Schur staggered 2017-10-10 10:00:43 +01:00
153672d8ec Split CG testing 2017-10-09 23:20:58 +01:00
08ca338875 Split grid communication 2017-10-09 23:19:45 +01:00
f7cbf82c04 Better stdout/err debug 2017-10-09 23:18:48 +01:00
07009c569a Comms splitting improvements 2017-10-09 23:16:51 +01:00
09f4cdb11e Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2017-10-04 10:51:16 +01:00
1e54882f71 Stagger 2017-10-04 10:51:06 +01:00
d54807b8c0 MPIT works with split grid now 2017-10-02 23:14:56 +01:00
5625b47c7d Merge branch 'feature/dwf-multirhs' into develop 2017-10-02 12:42:32 +01:00
1edcf902b7 Macos ANON 2017-10-02 12:41:02 +01:00
e5c19e1fd7 RB constructor change 2017-10-02 12:25:52 +01:00
a11d0a33d1 Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-10-02 11:42:07 +01:00
4f8b6f26b4 Merge branch 'develop' into feature/dwf-multirhs 2017-10-02 11:41:49 +01:00
073525c5b3 Small patch from cori 2017-10-02 03:38:21 -07:00
eb6153080a Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2017-10-02 08:56:33 +01:00
f7072d1ac2 Solving an annoying compilation error in json 2017-10-02 07:13:40 +01:00
fddeb29d6b Bug fix with spreadout FFT 2017-09-21 11:10:08 +01:00
a9ec5cf564 Christoph bug report integrate 2017-09-21 10:32:41 +01:00
946a8671b9 Merge pull request #129 from djm2131/feature/eofa
Add support for DWF with the exact one flavor algorithm
2017-09-21 10:15:21 +01:00
a6eeea777b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-21 10:12:41 +01:00
771a1b8e79 Merge pull request #128 from paboyle/feature/CG-reliable-update
Feature/cg reliable update
2017-09-21 10:12:03 +01:00
bfb68e6f02 Merge pull request #130 from giltirn/gparity-handunroll
Gparity handunroll
2017-09-21 10:11:00 +01:00
77f7737ccc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-19 14:28:01 +01:00
18c335198a Merge branch 'hotfix/dirac-ITT-fix1' into develop 2017-09-16 18:19:02 +01:00
f9df685cde Merge branch 'hotfix/dirac-ITT-fix1' 2017-09-16 18:18:48 +01:00
17c5b0f152 Patching comparison point 2017-09-16 18:18:07 +01:00
5918769f97 Subtle Naik term bug updated in Stencil; lesson on logical && with a function call on the right 2017-09-16 12:51:26 +01:00
bbaf1ada91 Merge branch 'feature/json-fix' into develop 2017-09-08 16:02:08 +01:00
1950ac9294 Fixed the Intel compiler problem with the JSON classes 2017-09-08 15:18:59 +01:00
13fa70ac1a Merge branch 'develop' into feature/json-fix 2017-09-08 13:42:20 +01:00
7cb2b11f26 Fixing Intel compiler error for the JSON parser 2017-09-08 13:41:53 +01:00
1184ed29ae Merge pull request #124 from nmeyer-ur/feature/arm-neon
Added integer reduce functionality
2017-09-08 10:54:35 +02:00
203c7bf6fa Merge branch 'hotfix/dirac-ITT-fix' into develop 2017-09-05 15:08:51 +01:00
c709883f3f Merge branch 'hotfix/dirac-ITT-fix' 2017-09-05 15:08:16 +01:00
aed5de4d50 Patching macos compile 2017-09-05 15:07:07 +01:00
ba27cc6571 Mac os happiness 2017-09-05 15:00:16 +01:00
d856327250 Merge branch 'release/dirac-ITT' into develop 2017-09-05 14:56:12 +01:00
d75369cb56 Merge branch 'release/dirac-ITT' 2017-09-05 14:55:54 +01:00
a5fe07c077 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-04 14:10:15 +01:00
b83b2b1415 Stability improvement to BCG. Force m_rr hermitian beyond rounding. 2017-09-04 14:09:47 +01:00
3006663b9c Schur solver for staggered type (hermitian Mpc) operators 2017-08-31 21:32:01 -04:00
59bd1fe21b Fix for 'perm' and 'local' not being set for hand-unrolled external-site Dslash, which caused incorrect behavior of G-parity kernel 2017-08-29 13:07:37 -07:00
4e907fef2c Merge remote-tracking branch 'grid/develop' into feature/arm-neon 2017-08-29 17:47:36 +02:00
67888b657f Merge branch 'gparity-handunroll' of https://github.com/giltirn/Grid into gparity-handunroll 2017-08-29 09:52:05 -04:00
74af885d4e Removed some no-longer-needed code associated with the G-parity hand-unrolled kernel 2017-08-29 09:50:37 -04:00
d36d2fb40d Added ability to override default Ls in Benchmark_dwf 2017-08-28 06:53:56 -07:00
4b4c2a715b fcntl.h needed 2017-08-26 11:38:04 +01:00
54a5e6c1d0 Check if we get huge pages on linux. Larry Meadows piece of magic. 2017-08-25 22:36:08 +01:00
f365a83fae In G-parity unrolled kernel, replaced calls to permute and exchange with run-time-evaluated permute type with explicit calls to appropriate underlying functions 2017-08-25 14:24:11 -04:00
34a9aeb331 Reduced number of if-statement evaluations in G-parity unrolled kernel 2017-08-24 13:53:50 -07:00
edabb3577f Imported Benchmark_gparity 2017-08-23 16:54:06 -04:00
ce5df177ee Removed superfluous implementation of G-parity twist for hand-unrolled kernel from GparityWilsonImpl 2017-08-23 15:05:22 -04:00
a0bb8e5b46 Added hand-unrolled kernel implementations of all the other dslash precision / comms precision combinations with G-parity 2017-08-23 14:44:40 -04:00
46f88e6d72 G-parity hand-unrolled intrinsics twist now uses one less permute and one less temporary 2017-08-23 13:21:10 -04:00
dd8f1ea189 Vectorized Mobius EOFA Dperp + shift operation 2017-08-23 13:17:26 -04:00
b61835c1a5 Added inplace version of intrinsic G-parity twist to hand-unrolled kernel 2017-08-23 12:33:48 -04:00
459f70e8d4 Check-in of working Mobius EOFA class and tests 2017-08-22 22:38:30 -04:00
061e48fd73 Replaced slow unpack-repack in G-parity BC twist with intrinsics version 2017-08-22 18:12:12 -04:00
ab50145001 Implemented first, unoptimized version of hand-unrolled G-parity kernels
Improved Test_gparity
2017-08-22 17:12:25 -04:00
0145685f96 Added Staggered Type Preconditioned operator 2017-08-18 01:44:31 -04:00
9d45fca8bc Implement MobiusEOFAFermioncache.cc 2017-08-17 23:45:36 -04:00
ac9e6b63c0 More re-import of Mobius EOFA 2017-08-17 19:28:53 -04:00
e140b3f802 Beginning to re-import Mobius EOFA 2017-08-16 23:36:23 -04:00
d9d3d30cc7 Minor clean-up 2017-08-16 20:57:51 -04:00
47a12ec7b5 Implement EOFA pseudofermion force and Shamir tests for G-parity and non G-parity cases 2017-08-16 19:50:08 -04:00
ec1e2f7a40 Add (mostly implemented) ExactOneFlavourRatio pseudofermion class and tests of Shamir heatbath and action 2017-08-16 12:38:59 -04:00
41f73ec083 Add ChronoForecast class for forecasting solutions across poles in the EOFA heatbath 2017-08-16 12:37:38 -04:00
6d0786ff9d Typo fixes and check-in of G-parity action test for DWF 2017-08-15 22:47:00 -04:00
b7f93aeb4d Change CayleyFermion5D::SetCoefficientsInternal to virtual to allow overriding in derived EOFA classes 2017-08-15 14:18:51 -04:00
202a7fe900 Re-import DWF and abstract base EOFA fermion classes and tests 2017-08-15 13:36:08 -04:00
e73e4b4002 Minor changes fixes 2017-08-11 01:35:25 -04:00
caa6605b43 Still tweaking memory saving routines in Lanczos 2017-08-07 00:01:04 -04:00
522c9248ae Merge branch 'develop' of https://github.com/paboyle/Grid into feature/Lanczos 2017-08-06 23:58:21 -04:00
7d867a8134 Merge branch 'develop' into feature/CG-reliable-update 2017-08-02 09:48:04 -04:00
9939b267d2 Added switching to fallback linear operator in reliable update CG, and added recalculation of b parameter on update. 2017-07-31 13:39:44 -04:00
191fbf85fc Added ImplicitlyRestartedLanczosCJ to Algorithms.h 2017-07-28 15:33:59 -04:00
8f4b3049cd Merge branch 'feature/CG-reliable-update' into ckelly_develop 2017-07-25 11:55:26 -04:00
2a6e673a91 Merge branch 'develop' into feature/CG-reliable-update 2017-07-25 11:54:43 -04:00
9b6cde173f Merge branch 'feature/CG-reliable-update' into ckelly_develop 2017-07-25 11:51:08 -04:00
9f280b82c4 Added mixed-precision CG with reliable updates 2017-07-25 11:30:41 -04:00
93650f3a61 Adding back (temporarily) dense matrix routines until Lanczos is finalized 2017-07-24 21:49:25 -04:00
cab4b4d063 Deleting old include file references 2017-07-24 20:51:31 -04:00
cf4b30b2dd re-adding ImplicitlyRestartedLanczos 2017-07-24 20:40:25 -04:00
c51d0b4078 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/Lanczos 2017-07-24 20:35:29 -04:00
7a53dc3715 Added integer reduce functionality 2017-07-24 11:12:59 +02:00
9fa07eecde Merge branch 'develop' into feature/json-fix 2017-07-12 15:47:22 +01:00
f64fb7bd77 Fix gcc error on JSON compilation 2017-07-12 14:55:42 +01:00
2a35449b91 Merge branch 'develop' into feature/json-fix 2017-07-12 14:47:00 +01:00
184af5bd05 Added support for std::pair in the JSON serialiser 2017-07-12 14:44:53 +01:00
097c9637ee Fixed the JSON parsing error 2017-07-11 14:31:57 +01:00
d9593c4b81 Merge branch 'develop' into feature/json-fix 2017-07-07 14:17:50 +01:00
ac740f73ce Works on Cori 2017-07-02 16:47:58 -07:00
75dc7794b9 Working on Cori 2017-07-02 16:47:42 -07:00
dee68fc728 IO working on multiple nodes again. Strategy of all nodes writing metadata is unsafe.
Only one rank should do this; must identify that rank, which means passing the communicator to the objects.
2017-07-02 23:33:48 +01:00
a2d3643634 Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-07-02 14:59:22 -07:00
57002924bc NERSC shakeout of this 2017-07-02 14:58:30 -07:00
4a29ab0d0a Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-06-23 23:10:43 +01:00
0165bcb58e Added an update to TODO list 2017-06-23 23:10:24 +01:00
349d75e483 Precision fix 2017-06-23 02:57:59 -07:00
e51475703a Ticking off lots on the TODO list 2017-06-23 09:42:21 +01:00
1feddf4ba6 const fixes 2017-06-22 19:32:41 +01:00
600d7ddc2e Proof of concept : Multi RHS solver, running independent solves on different ranks 2017-06-22 18:54:34 +01:00
e504260f3d Able to run a test job splitting into multiple MPI subdomains. 2017-06-22 18:53:11 +01:00
5e4bea8f20 Benchmark DWF works 2017-06-22 08:38:54 +01:00
6ebf9f15b7 Splitting communicators first cut 2017-06-22 08:14:34 +01:00
1d7aa673a4 Include BlockCG by default 2017-06-21 21:08:53 +01:00
b9104f3072 Block CG 2017-06-21 21:08:03 +01:00
b672717096 Test_serialiation update for JSON 2017-06-19 14:38:39 +01:00
284ee194b1 JSON update 2017-06-19 14:38:15 +01:00
2f4cbeb4d5 Minor changes 2017-06-12 18:25:18 -04:00
fb7c4fb815 Recovering lapack interface without array allocation 2017-06-07 00:00:59 -04:00
00bb71e5af Checking in before reworking lapack interface 2017-06-06 16:26:41 -04:00
cfed2c1ea0 Broken Lanczos. Going back to an older version temporarily. 2017-06-06 12:14:45 -04:00
b1b15f0b70 Further fixes from multidimensional array 2017-06-05 23:13:41 -04:00
927c7ae3ed changed allocation for LAPACK temporaries, to avoid crashing with some compilers (reported by Christoph) 2017-05-25 21:43:53 -04:00
05d04ceff8 Adding SimpleLanczos 2017-05-25 12:30:47 -04:00
8313367a50 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/Lanczos 2017-05-25 12:30:06 -04:00
5c479ce663 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/Lanczos 2017-05-24 18:58:53 -04:00
4bf9d65bf8 Checking in memory saving version of Lanczos 2017-05-24 18:57:32 -04:00
3a056c4dff Re-adding Bisection for SimpleLanczos 2017-05-22 18:23:03 -04:00
b0ba651654 Turning off the final sort for now 2017-05-19 10:49:09 -04:00
25d4c175c3 Cleaning up Lanczos 2017-05-18 18:33:47 -04:00
a8d7986e1c Temporary (hopefully) change to run with GCC for now. 2017-05-05 10:55:07 -04:00
92ec509bfa Committing to move to JLab 2017-05-04 19:32:00 -04:00
e80a87ff7f Checking in before modifying 2017-05-04 16:05:07 -04:00
867fe93018 First Rotate reorg done. 2017-05-02 01:26:22 -04:00
09651c3326 Checking in before rearranging Lanczos 2017-05-02 00:47:18 -04:00
f87f2a3f8b Merge branch 'develop' of https://github.com/paboyle/Grid into feature/Lanczos 2017-05-01 12:00:47 -04:00
a07556dd5f Added back the convergence test from evecs of tridiagonal matrix. Bugfixes 2017-04-15 09:32:15 -04:00
f80a847aef Merge branch 'develop' into bugfix/dminus 2017-04-06 23:49:10 -04:00
93cb5d4e97 Working version of Lanczos without the extra copy. 2017-04-06 23:35:30 -04:00
9e48b7dfda MEM_SAVE in Lanczos seems to be working, but not pretty 2017-04-06 22:21:56 -04:00
d0c2c9c71f Merge branch 'develop' of https://github.com/paboyle/Grid into bugfix/dminus 2017-04-04 15:20:17 -04:00
c8cafa77ca Checking in the latest Lanczos 2017-04-04 15:18:12 -04:00
a3bcad3804 Added preconditioned SYM2 solver (SchurRedBlackDiagTwoSolve) 2017-03-30 20:33:27 -04:00
5a5b66292b Merge branch 'develop' of https://github.com/paboyle/Grid into bugfix/dminus 2017-03-30 10:44:02 -04:00
e63be32ad2 zmobius Meooe5D fixed? 2017-03-28 03:48:50 -04:00
6aa106d906 Fixing zmobius coeffs again 2017-03-27 21:57:07 -04:00
33d59c8869 Adding Zmobius prec test 2017-03-27 21:40:27 -04:00
a833fd8dbf Merge branch 'develop' of https://github.com/paboyle/Grid into bugfix/dminus 2017-03-27 21:37:26 -04:00
e9712bc7fb Zmobius test was wrong! (only mobius) checking in again 2017-03-16 23:04:28 -04:00
0cd6b1858c Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2016-12-14 09:23:22 +00:00
133 changed files with 26850 additions and 7426 deletions

TODO
View File

@ -3,19 +3,19 @@ TODO:
Large item work list:
1)- BG/Q port and check
1)- BG/Q port and check ; Andrew says ok.
2)- Christoph's local basis expansion Lanczos
3)- Precision conversion and sort out localConvert <-- partial
- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
--
3a)- RNG I/O in ILDG/SciDAC (minor)
3b)- Precision conversion and sort out localConvert <-- partial/easy
3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
4)- Physical propagator interface
5)- Conserved currents
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
7)- HDCR resume
Recent DONE
-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE
-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O ; <-- DONE ; bmark cori
-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
-- GaugeFix into central location <-- DONE
-- Scidac and Ildg metadata handling <-- DONE

View File

@ -701,12 +701,14 @@ int main (int argc, char ** argv)
if ( do_su3 ) {
// empty for now
}
#if 1
int sel=2;
std::vector<int> L_list({8,12,16,24});
//int sel=1;
// std::vector<int> L_list({8,12});
#else
int sel=1;
std::vector<int> L_list({8,12});
#endif
int selm1=sel-1;
std::vector<double> robust_list;
std::vector<double> wilson;
@ -785,7 +787,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << dwf4[sel]/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << " Comparison point robustness: " << robust_list[sel] <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

View File

@ -51,7 +51,13 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=16;
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
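// Note: this scan of argv lets e.g. "-Ls 12" on the command line override the former compile-time default of Ls = 16.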
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);

View File

@ -0,0 +1,190 @@
#include <Grid/Grid.h>
#include <sstream>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF;
typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "Ls = " << Ls << std::endl;
std::vector<int> latt4 = GridDefaultLatt();
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
GparityLatticeFermionF src (FGrid); random(RNG5,src);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
GparityLatticeFermionF result(FGrid); result=zero;
GparityLatticeFermionF ref(FGrid); ref=zero;
GparityLatticeFermionF tmp(FGrid);
GparityLatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE/SINGLE"<<std::endl;
GparityDomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
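// Note (assumption, not in the original source): 1344 is the usual Grid flop count per site for an
// Nc=3 Wilson/DWF Dhop; the extra factor of 2 presumably accounts for the two G-parity flavour components.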
double flops=2*1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
std::cout << GridLogMessage<< "* SINGLE/HALF"<<std::endl;
GparityDomainWallFermionFH DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
if (1) {
FGrid->Barrier();
DwH.ZeroCounters();
DwH.Dhop(src,result,0);
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwH.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=2*1344*volume*ncall;
std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
DwH.Report();
}
GridCartesian * UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
GridCartesian * FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_d);
GridRedBlackCartesian * FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_d);
std::cout << GridLogMessage<< "* DOUBLE/DOUBLE"<<std::endl;
GparityLatticeFermionD src_d(FGrid_d);
precisionChange(src_d,src);
LatticeGaugeFieldD Umu_d(UGrid_d);
precisionChange(Umu_d,Umu);
GparityLatticeFermionD result_d(FGrid_d);
GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
if (1) {
FGrid_d->Barrier();
DwD.ZeroCounters();
DwD.Dhop(src_d,result_d,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwD.Dhop(src_d,result_d,0);
__SSC_STOP;
}
double t1=usecond();
FGrid_d->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=2*1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
DwD.Report();
}
Grid_finalize();
}

View File

@ -40,7 +40,7 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

View File

@ -58,7 +58,7 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

View File

@ -93,7 +93,7 @@ int main (int argc, char ** argv)
std::cout << latt_size.back() << "\t\t";
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
LatticeGaugeField Umu(&Grid); random(pRNG,Umu);

View File

@ -5,7 +5,7 @@ EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2'
echo "-- deploying Eigen source..."
wget ${EIGEN_URL} --no-check-certificate
./scripts/update_eigen.sh `basename ${EIGEN_URL}`
rm `basename ${EIGEN_URL}`
#rm `basename ${EIGEN_URL}`
echo '-- generating Make.inc files...'
./scripts/filelist

View File

@ -550,6 +550,7 @@ AC_CONFIG_FILES(tests/forces/Makefile)
AC_CONFIG_FILES(tests/hadrons/Makefile)
AC_CONFIG_FILES(tests/hmc/Makefile)
AC_CONFIG_FILES(tests/solver/Makefile)
AC_CONFIG_FILES(tests/lanczos/Makefile)
AC_CONFIG_FILES(tests/smearing/Makefile)
AC_CONFIG_FILES(tests/qdpxx/Makefile)
AC_CONFIG_FILES(tests/testu01/Makefile)

View File

@ -1,6 +1,6 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Algorithms.h
@ -37,6 +37,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/densematrix/DenseMatrix.h>
#include <Grid/algorithms/densematrix/Francis.h>
#include <Grid/algorithms/densematrix/Householder.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
@ -44,30 +49,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
// Lanczos support
//#include <Grid/algorithms/iterative/MatrixUtils.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczosCJ.h>
#include <Grid/algorithms/iterative/SimpleLanczos.h>
#include <Grid/algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/FFT.h>
// Eigen/lanczos
// EigCg
// MCR
// Pcg
// Multishift CG
// Hdcg
// GCR
// etc..
// integrator/Leapfrog
// integrator/Omelyan
// integrator/ForceGradient
// montecarlo/hmc
// montecarlo/rhmc
// montecarlo/metropolis
// etc...
#endif

View File

@ -230,6 +230,7 @@ namespace Grid {
// Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
PARALLEL_REGION
{
@ -240,7 +241,8 @@ namespace Grid {
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=p*L;
cbuf[dim]+=((pc+p) % processors[dim])*L;
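// Note (presumed intent): offsetting by this rank's processor coordinate pc makes every rank assemble
// the global pencil in the same global order, which is the spread-out FFT fix noted in the commit log above.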
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
}
}
@ -278,7 +280,6 @@ namespace Grid {
flops+= flops_call*NN;
// writing out result
int pc = processor_coor[dim];
PARALLEL_REGION
{
std::vector<int> clbuf(Nd), cgbuf(Nd);

View File

@ -8,6 +8,7 @@
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -162,15 +163,10 @@ namespace Grid {
_Mat.M(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ComplexD dot;
_Mat.M(in,out);
dot= innerProduct(in,out);
n1=real(dot);
dot = innerProduct(out,out);
n2=real(dot);
ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.M(in,out);
@ -192,10 +188,10 @@ namespace Grid {
ni=Mpc(in,tmp);
no=MpcDag(tmp,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
MpcDagMpc(in,out,n1,n2);
}
void HermOp(const Field &in, Field &out){
virtual void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
@ -212,7 +208,6 @@ namespace Grid {
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
};
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
@ -270,7 +265,6 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in);
}
};
template<class Matrix,class Field>
class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected:
@ -299,6 +293,168 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1 ) phi = eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
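// Sketch of the algebra behind the comment above (standard even/odd elimination; tying the two
// forms to the aliases is the presumed intent):
//   With M = [[Mee, Meo],[Moe, Moo]], eliminating the even checkerboard from M psi = eta gives
//     (Moo - Moe Mee^-1 Meo) psi_o = eta'_o.
//   Left-preconditioning by Moo^-1:          (1 - Moo^-1 Moe Mee^-1 Meo) psi_o = Moo^-1 eta'_o            (SchurDiagOneLH)
//   Right-preconditioning, phi = Moo psi_o:  (1 - Moe Mee^-1 Meo Moo^-1) phi   = eta'_o, psi_o = Moo^-1 phi (SchurDiagOneRH)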
///////////////////////////////////////////////////////////////////////////////////////////////////
// Staggered use
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
n2 = Mpc(in,out);
ComplexD dot= innerProduct(in,out);
n1 = real(dot);
}
virtual void HermOp(const Field &in, Field &out){
Mpc(in,out);
}
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
_Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
_Mat.Mooee(in,out);
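// the axpy below gives out = Mooee in - Meooe MooeeInv Meooe in, i.e. the Schur complement on
// the chosen checkerboard; axpy_norm also returns its norm.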
return axpy_norm(out,-1.0,tmp,out);
}
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered
}
};
// template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
template<class Matrix,class Field>
// class SchurStagOperator : public LinearOperatorBase<Field> {
class SchurStagOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurStagOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
Field tmp2(in._grid);
_Mat.Mooee(in,out);
_Mat.Mooee(out,tmp);
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp2);
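// here tmp = Mooee Mooee in and tmp2 = Meooe Meooe in, so the axpy below returns
// out = (Mooee Mooee - Meooe Meooe) in, which is Hermitian since the staggered hopping term obeys Meooe^dag = -Meooe.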
return axpy_norm(out,-1.0,tmp2,tmp);
}
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out);
}
#if 0
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in._grid);
ni=Mpc(in,tmp);
no=MpcDag(tmp,out);
}
#endif
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
n2 = Mpc(in,out);
ComplexD dot = innerProduct(in,out);
n1 = real(dot);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
void Op (const Field &in, Field &out){
Mpc(in,out);
}
void AdjOp (const Field &in, Field &out){
MpcDag(in,out);
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0); // must coarsen the unpreconditioned system
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
};
#if 0
// This is specific to (Z)mobius fermions
template<class Matrix, class Field>
class KappaSimilarityTransform {
public:
// INHERIT_IMPL_TYPES(Matrix);
typedef typename Matrix::Coeff_t Coeff_t;
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
KappaSimilarityTransform (Matrix &zmob) {
for (int i=0;i<(int)zmob.bs.size();i++) {
Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) );
kappa.push_back( k );
kappaDag.push_back( conj(k) );
kappaInv.push_back( 1.0 / k );
kappaInvDag.push_back( 1.0 / conj(k) );
}
}
template<typename vobj>
void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) {
GridBase *grid=out._grid;
out.checkerboard = in.checkerboard;
assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
int Ls = grid->_rdimensions[0];
parallel_for(int ss=0;ss<grid->oSites();ss++){
vobj tmp = s[ss % Ls]*in._odata[ss];
vstream(out._odata[ss],tmp);
}
}
RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) {
sscale(in,out,s);
return norm2(out);
}
virtual RealD M (const Field& in, Field& out) { return sscale_norm(in,out,&kappa[0]); }
virtual RealD MDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaDag[0]);}
virtual RealD MInv (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInv[0]);}
virtual RealD MInvDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInvDag[0]);}
};
template<class Matrix,class Field>
class SchurDiagTwoKappaOperator : public SchurOperatorBase<Field> {
public:
KappaSimilarityTransform<Matrix, Field> _S;
SchurDiagTwoOperator<Matrix, Field> _Mat;
SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
_S.MInv(in,out);
_Mat.Mpc(out,tmp);
return _S.M(tmp,out);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
_S.MDag(in,out);
_Mat.MpcDag(out,tmp);
return _S.MInvDag(tmp,out);
}
};
#endif
/////////////////////////////////////////////////////////////

View File

@ -8,6 +8,7 @@
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -193,6 +194,47 @@ namespace Grid {
return sum;
};
RealD approxD(RealD x)
{
RealD Un;
RealD Unm;
RealD Unp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD U0=1;
RealD U1=2*y;
RealD sum;
sum = Coeffs[1]*U0;
sum+= Coeffs[2]*U1*2.0;
Un =U1;
Unm=U0;
for(int i=2;i<order-1;i++){
Unp=2*y*Un-Unm;
Unm=Un;
Un =Unp;
sum+= Un*Coeffs[i+1]*(i+1.0);
}
return sum/(0.5*(hi-lo));
};
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
RealD x = x0;
RealD eps;
int i;
for (i=0;i<maxiter;i++) {
eps = approx(x) - z;
if (fabs(eps / z) < resid)
return x;
x = x - eps / approxD(x);
}
return std::numeric_limits<double>::quiet_NaN();
}
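// Sketch of what the two methods above implement, assuming approx(x) sums Coeffs[n]*T_n(y) with
// y = (x - 0.5*(hi+lo)) / (0.5*(hi-lo)):
//   approxD(x) = d/dx approx(x) = (1/(0.5*(hi-lo))) * sum_n n*Coeffs[n]*U_{n-1}(y), using T_n'(y) = n U_{n-1}(y);
//   approxInv solves approx(x) = z by Newton iteration x <- x - (approx(x)-z)/approxD(x), starting from x0 and
//   returning NaN if |approx(x)-z|/|z| does not fall below resid within maxiter steps.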
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

View File

@ -0,0 +1,152 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/Forecast.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H
namespace Grid {
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
class Forecast
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = zero; return chi; }
else if(degree == 1){ return prev_solns[0]; }
RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = std::conj(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = zero;
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = std::conj(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
}
#endif
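A minimal usage sketch for the class above (illustrative only: the operator D, the source phi and
the solution history are assumed to be constructed elsewhere with Grid's usual
DomainWallFermionR / LatticeFermion types):

  std::vector<LatticeFermion> prev_solns;               // solutions accumulated from earlier heatbath steps
  ChronoForecast<DomainWallFermionR, LatticeFermion> forecast;
  LatticeFermion guess = forecast(D, phi, prev_solns);  // forecasted initial guess for D psi = phi
  // 'guess' is then handed to the usual CG/MCR solve as the starting vector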

View File

@ -0,0 +1,137 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/DenseMatrix.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DENSE_MATRIX_H
#define GRID_DENSE_MATRIX_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Matrix utils
/////////////////////////////////////////////////////////////
template<class T> using DenseVector = std::vector<T>;
template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
template<class T> void Size(DenseVector<T> & vec, int &N)
{
N= vec.size();
}
template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M)
{
N= mat.size();
M= mat[0].size();
}
template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
{
int M; Size(mat,N,M);
assert(N==M);
}
template<class T> void Resize(DenseVector<T > & mat, int N) {
mat.resize(N);
}
template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) {
mat.resize(N);
for(int i=0;i<N;i++){
mat[i].resize(M);
}
}
template<class T> void Fill(DenseMatrix<T> & mat, T&val) {
int N,M;
Size(mat,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
mat[i][j] = val;
}}
}
/** Transpose of a matrix **/
template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
int N,M;
Size(mat,N,M);
DenseMatrix<T> C; Resize(C,M,N);
for(int i=0;i<M;i++){
for(int j=0;j<N;j++){
C[i][j] = mat[j][i];
}}
return C;
}
/** Set DenseMatrix to unit matrix **/
template<class T> void Unity(DenseMatrix<T> &A){
int N; SizeSquare(A,N);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
if ( i==j ) A[i][j] = 1;
else A[i][j] = 0;
}
}
}
/** Add C * I to matrix **/
template<class T>
void PlusUnit(DenseMatrix<T> & A,T c){
int dim; SizeSquare(A,dim);
for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}
}
/** return the Hermitian conjugate of matrix **/
template<class T>
DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
int dim; SizeSquare(mat,dim);
DenseMatrix<T> C; Resize(C,dim,dim);
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
C[i][j] = conj(mat[j][i]);
}
}
return C;
}
/**Get a square submatrix**/
template <class T>
DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
{
DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
H[i-row_st][j-col_st]=A[i][j];
}}
return H;
}
}
#include "Householder.h"
#include "Francis.h"
#endif
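A short usage sketch of these dense-matrix helpers (illustrative only; assumes they are pulled in
through Algorithms.h as re-added above):

  #include <Grid/Grid.h>
  using namespace Grid;
  int main(void) {
    DenseMatrix<double> A; Resize(A, 3, 3);
    Unity(A);                               // A = identity
    PlusUnit(A, 2.0);                       // add 2 to each diagonal entry
    DenseMatrix<double> At = Transpose(A);  // diagonal, so At == A
    int N; SizeSquare(At, N);               // N == 3
    return 0;
  }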

View File

@ -0,0 +1,525 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Francis.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef FRANCIS_H
#define FRANCIS_H
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <fstream>
#include <complex>
#include <algorithm>
//#include <timer.h>
//#include <lapacke.h>
//#include <Eigen/Dense>
namespace Grid {
template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
template <class T> int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
/**
Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
H =
x x x x x x x x x
x x x x x x x x x
0 x x x x x x x x
0 0 x x x x x x x
0 0 0 x x x x x x
0 0 0 0 x x x x x
0 0 0 0 0 x x x x
0 0 0 0 0 0 x x x
0 0 0 0 0 0 0 x x
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary.
**/
template <class T>
int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
{
DenseMatrix<T> H = Hin;
int N ; SizeSquare(H,N);
int M = N;
Fill(evals,0);
Fill(evecs,0);
T s,t,x=0,y=0,z=0;
T u,d;
T apd,amd,bc;
DenseVector<T> p(N,0);
T nrm = Norm(H); ///DenseMatrix Norm
int n, m;
int e = 0;
int it = 0;
int tot_it = 0;
int l = 0;
int r = 0;
DenseMatrix<T> P; Resize(P,N,N); Unity(P);
DenseVector<int> trows(N,0);
/// Check if the matrix is really hessenberg, if not abort
RealD sth = 0;
for(int j=0;j<N;j++){
for(int i=j+2;i<N;i++){
sth = abs(H[i][j]);
if(sth > small){
std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
exit(1);
}
}
}
do{
std::cout << "Francis QR Step N = " << N << std::endl;
/** Check for convergence
x x x x x
0 x x x x
0 0 x x x
0 0 x x x
0 0 0 0 x
for this matrix l = 4
**/
do{
l = Chop_subdiag(H,nrm,e,small);
r = 0; ///May have converged on more than one eval
///Single eval
if(l == N-1){
evals[e] = H[l][l];
N--; e++; r++; it = 0;
}
///RealD eval
if(l == N-2){
trows[l+1] = 1; ///Needed for UTSolve
apd = H[l][l] + H[l+1][l+1];
amd = H[l][l] - H[l+1][l+1];
bc = (T)4.0*H[l+1][l]*H[l][l+1];
evals[e] = (T)0.5*( apd + sqrt(amd*amd + bc) );
evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
N-=2; e+=2; r++; it = 0;
}
} while(r>0);
if(N ==0) break;
DenseVector<T > ck; Resize(ck,3);
DenseVector<T> v; Resize(v,3);
for(int m = N-3; m >= l; m--){
///Starting vector essentially random shift.
if(it%10 == 0 && N >= 3 && it > 0){
s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
z = H[m+1][m]*H[m+2][m+1];
}
///Starting vector implicit Q theorem
else{
s = (H[N-2][N-2] + H[N-1][N-1]);
t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
z = H[m+1][m]*H[m+2][m+1];
}
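// s and t in this branch are the trace and determinant of the trailing 2x2 block, so the
// implicit double shift uses that block's two (possibly complex-conjugate) eigenvalues.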
ck[0] = x; ck[1] = y; ck[2] = z;
if(m == l) break;
/** Some stupid thing from Numerical Recipes, seems to work **/
// PAB.. for heaven's sake quote page, purpose, evidence it works.
// what sort of comment is that!?!?!?
u=abs(H[m][m-1])*(abs(y)+abs(z));
d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
if ((T)abs(u+d) == (T)abs(d) ){
l = m; break;
}
//if (u < small){l = m; break;}
}
if(it > 100000){
std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
exit(1);
}
normalize(ck); ///Normalization cancels in PHP anyway
T beta;
Householder_vector<T >(ck, 0, 2, v, beta);
Householder_mult<T >(H,v,beta,0,l,l+2,0);
Householder_mult<T >(H,v,beta,0,l,l+2,1);
///Accumulate eigenvector
Householder_mult<T >(P,v,beta,0,l,l+2,1);
int sw = 0; ///Are we on the last row?
for(int k=l;k<N-2;k++){
x = H[k+1][k];
y = H[k+2][k];
z = (T)0.0;
if(k+3 <= N-1){
z = H[k+3][k];
} else{
sw = 1;
v[2] = (T)0.0;
}
ck[0] = x; ck[1] = y; ck[2] = z;
normalize(ck);
Householder_vector<T >(ck, 0, 2-sw, v, beta);
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
///Accumulate eigenvector
Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
}
it++;
tot_it++;
}while(N > 1);
N = evals.size();
///Annoying - UT solves in reverse order;
DenseVector<T> tmp; Resize(tmp,N);
for(int i=0;i<N;i++){
tmp[i] = evals[N-i-1];
}
evals = tmp;
UTeigenvectors(H, trows, evals, evecs);
for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
return tot_it;
}
template <class T>
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
{
/**
Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
H =
x x 0 0 0 0
x x x 0 0 0
0 x x x 0 0
0 0 x x x 0
0 0 0 x x x
0 0 0 0 x x
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary. **/
return my_Wilkinson(Hin, evals, evecs, small, small);
}
template <class T>
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
{
int N; SizeSquare(Hin,N);
int M = N;
///I don't want to modify the input but matrices must be passed by reference
//Scale a matrix by its "norm"
//RealD Hnorm = abs( Hin.LargestDiag() ); H = H*(1.0/Hnorm);
DenseMatrix<T> H; H = Hin;
RealD Hnorm = abs(Norm(Hin));
H = H * (1.0 / Hnorm);
// TODO use openmp and memset
Fill(evals,0);
Fill(evecs,0);
T s, t, x = 0, y = 0, z = 0;
T u, d;
T apd, amd, bc;
DenseVector<T> p; Resize(p,N); Fill(p,0);
T nrm = Norm(H); ///DenseMatrix Norm
int n, m;
int e = 0;
int it = 0;
int tot_it = 0;
int l = 0;
int r = 0;
DenseMatrix<T> P; Resize(P,N,N);
Unity(P);
DenseVector<int> trows(N, 0);
/// Check if the matrix is really symm tridiag
RealD sth = 0;
for(int j = 0; j < N; ++j)
{
for(int i = j + 2; i < N; ++i)
{
if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
{
std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
std::cout << "Warning tridiagonalize and call again" << std::endl;
// exit(1); // see what is going on
//return;
}
}
}
do{
do{
//Jasper
//Check if the subdiagonal term is small enough (<small)
//if true then it is converged.
//check start from H.dim - e - 1
//How to deal with more than 2 are converged?
//What if Chop_symm_subdiag return something int the middle?
//--------------
l = Chop_symm_subdiag(H,nrm, e, small);
r = 0; ///May have converged on more than one eval
//Jasper
//In this case
// x x 0 0 0 0
// x x x 0 0 0
// 0 x x x 0 0
// 0 0 x x x 0
// 0 0 0 x x 0
// 0 0 0 0 0 x <- l
//--------------
///Single eval
if(l == N - 1)
{
evals[e] = H[l][l];
N--;
e++;
r++;
it = 0;
}
//Jasper
// x x 0 0 0 0
// x x x 0 0 0
// 0 x x x 0 0
// 0 0 x x 0 0
// 0 0 0 0 x x <- l
// 0 0 0 0 x x
//--------------
///RealD eval
if(l == N - 2)
{
trows[l + 1] = 1; ///Needed for UTSolve
apd = H[l][l] + H[l + 1][ l + 1];
amd = H[l][l] - H[l + 1][l + 1];
bc = (T) 4.0 * H[l + 1][l] * H[l][l + 1];
evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
N -= 2;
e += 2;
r++;
it = 0;
}
}while(r > 0);
//Jasper
//Already converged
//--------------
if(N == 0) break;
DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
for(int m = N - 3; m >= l; m--)
{
///Starting vector essentially random shift.
if(it%10 == 0 && N >= 3 && it > 0)
{
t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
x = H[m][m] - t;
z = H[m + 1][m];
} else {
///Starting vector implicit Q theorem
d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
t = H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2]
/ (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
x = H[m][m] - t;
z = H[m + 1][m];
}
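// t here is the Wilkinson shift: the eigenvalue of the trailing symmetric 2x2 block
// [[H[N-2][N-2], H[N-1][N-2]], [H[N-1][N-2], H[N-1][N-1]]] closest to H[N-1][N-1].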
//Jasper
//why it is here????
//-----------------------
if(m == l)
break;
u = abs(H[m][m - 1]) * (abs(y) + abs(z));
d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
if ((T)abs(u + d) == (T)abs(d))
{
l = m;
break;
}
}
//Jasper
if(it > 1000000)
{
std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
exit(1);
}
//
T s, c;
Givens_calc<T>(x, z, c, s);
Givens_mult<T>(H, l, l + 1, c, -s, 0);
Givens_mult<T>(H, l, l + 1, c, s, 1);
Givens_mult<T>(P, l, l + 1, c, s, 1);
//
for(int k = l; k < N - 2; ++k)
{
x = H[k + 1][k];
z = H[k + 2][k];
Givens_calc<T>(x, z, c, s);
Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
Givens_mult<T>(H, k + 1, k + 2, c, s, 1);
Givens_mult<T>(P, k + 1, k + 2, c, s, 1);
}
it++;
tot_it++;
}while(N > 1);
N = evals.size();
///Annoying - UT solves in reverse order;
DenseVector<T> tmp(N);
for(int i = 0; i < N; ++i)
tmp[i] = evals[N-i-1];
evals = tmp;
//
UTeigenvectors(H, trows, evals, evecs);
//UTSymmEigenvectors(H, trows, evals, evecs);
for(int i = 0; i < evals.size(); ++i)
{
evecs[i] = P * evecs[i];
normalize(evecs[i]);
evals[i] = evals[i] * Hnorm;
}
// // FIXME this is to test
// Hin.write("evecs3", evecs);
// Hin.write("evals3", evals);
// // check rsd
// for(int i = 0; i < M; i++) {
// vector<T> Aevec = Hin * evecs[i];
// RealD norm2(0.);
// for(int j = 0; j < M; j++) {
// norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
// }
// }
return tot_it;
}
template <class T>
void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
/**
turn a matrix A =
x x x x x
x x x x x
x x x x x
x x x x x
x x x x x
into
x x x x x
x x x x x
0 x x x x
0 0 x x x
0 0 0 x x
with householder rotations
Slow.
*/
int N ; SizeSquare(A,N);
DenseVector<T > p; Resize(p,N); Fill(p,0);
for(int k=start;k<N-2;k++){
//cerr << "hess" << k << std::endl;
DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);} ///kth column
normalize(ck); ///Normalization cancels in PHP anyway
T beta;
Householder_vector<T >(ck, 0, ck.size()-1, v, beta); ///Householder vector
Householder_mult<T>(A,v,beta,start,k+1,N-1,0); ///A -> PA
Householder_mult<T >(A,v,beta,start,k+1,N-1,1); ///PA -> PAP^H
///Accumulate eigenvector
Householder_mult<T >(Q,v,beta,start,k+1,N-1,1); ///Q -> QP^H
}
/*for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A(0,k,l);
}
}*/
}
template <class T>
void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
///Tridiagonalize a matrix
int N; SizeSquare(A,N);
Hess(A,Q,start);
/*for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A(0,l,k);
}
}*/
}
template <class T>
void ForceTridiagonal(DenseMatrix<T> &A){
///Tridiagonalize a matrix
int N ; SizeSquare(A,N);
for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A[l][k]=0;
A[k][l]=0;
}
}
}
template <class T>
int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
///Solve a symmetric eigensystem, not necessarily in tridiagonal form
int N; SizeSquare(Ain,N);
DenseMatrix<T > A; A = Ain;
DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
Tri(A,Q,0);
int it = my_Wilkinson<T>(A, evals, evecs, small);
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
return it;
}
template <class T>
int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
return my_Wilkinson(Ain, evals, evecs, small);
}
template <class T>
int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
return my_SymmEigensystem(Ain, evals, evecs, small);
}
template <class T>
int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
///Solve a general eigensystem, not necessarily in tridiagonal form
int N = Ain.dim;
DenseMatrix<T > A(N); A = Ain;
DenseMatrix<T > Q(N);Q.Unity();
Hess(A,Q,0);
int it = QReigensystem<T>(A, evals, evecs, small);
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
return it;
}
}
#endif
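The QR sweep above shifts by the Wilkinson value taken from the trailing 2x2 block of H. A minimal, self-contained sketch of that shift formula, using plain doubles rather than the DenseMatrix helpers (the function name and test values below are illustrative only):

#include <cmath>
#include <iostream>

// For the trailing 2x2 block [[a,b],[b,c]] of a symmetric tridiagonal matrix,
// the Wilkinson shift is the eigenvalue of that block closest to c.
double wilkinson_shift(double a, double b, double c) {
  double d = 0.5 * (a - c);
  double sgn = (d >= 0.0) ? 1.0 : -1.0;
  return c - b * b / (d + sgn * std::sqrt(d * d + b * b));
}

int main() {
  // [[2,1],[1,3]] has eigenvalues 2.5 +/- sqrt(1.25), i.e. ~1.382 and ~3.618.
  std::cout << wilkinson_shift(2.0, 1.0, 3.0) << std::endl;   // prints ~3.618
}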

View File

@@ -0,0 +1,242 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Householder.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef HOUSEHOLDER_H
#define HOUSEHOLDER_H
#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define ENTER() std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define LEAVE() std::cout << GridLogMessage << "EXIT "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <fstream>
#include <complex>
#include <algorithm>
namespace Grid {
/** Comparison function for finding the max element in a vector **/
template <class T> bool cf(T i, T j) {
return abs(i) < abs(j);
}
/**
Calculate a real Givens angle
**/
template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
RealD mz = (RealD)abs(z);
if(mz==0.0){
c = 1; s = 0;
return; // nothing to rotate if z is already zero
}
if(mz >= (RealD)abs(y)){
T t = -y/z;
s = (T)1.0 / sqrt ((T)1.0 + t * t);
c = s * t;
} else {
T t = -z/y;
c = (T)1.0 / sqrt ((T)1.0 + t * t);
s = c * t;
}
}
template <class T> inline void Givens_mult(DenseMatrix<T> &A, int i, int k, T c, T s, int dir)
{
int q ; SizeSquare(A,q);
if(dir == 0){
for(int j=0;j<q;j++){
T nu = A[i][j];
T w = A[k][j];
A[i][j] = (c*nu + s*w);
A[k][j] = (-s*nu + c*w);
}
}
if(dir == 1){
for(int j=0;j<q;j++){
T nu = A[j][i];
T w = A[j][k];
A[j][i] = (c*nu - s*w);
A[j][k] = (s*nu + c*w);
}
}
}
/**
from input = x;
Compute the complex Householder vector, v, such that
P = (I - b v transpose(v) )
b = 2/v.v
P | x |   | x |   k = 0
  | x |   | 0 |
  | x | = | 0 |
  | x |   | 0 |   j = 3
  | x |   | x |
These are the "Unreduced" Householder vectors.
**/
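/* Worked example (illustrative only): for the real sub-vector x = (3,4) with
   k = 0, j = 1, the update gives v = x + ||x|| e_0 = (8,4) up to the overall
   normalisation by the largest entry (which leaves P unchanged), so
   b = 2/(v.v) = 1/40 and P = I - b v v^T = [ -0.6 -0.8 ; -0.8 0.6 ],
   which maps x to (-5, 0): everything below position k is annihilated. */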
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
{
int N ; Size(input,N);
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
if(abs(m) > 0.0){
T alpha = 0;
for(int i=k; i<j+1; i++){
v[i] = input[i]/m;
alpha = alpha + v[i]*conj(v[i]);
}
alpha = sqrt(alpha);
beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
if(abs(v[k]) > 0.0) v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
else v[k] = -alpha;
} else{
for(int i=k; i<j+1; i++){
v[i] = 0.0;
}
}
}
/**
from input = x;
Compute the complex Householder vector, v, such that
P = (I - b v transpose(v) )
b = 2/v.v
Px = alpha*e_dir
These are the "Unreduced" Householder vectors.
**/
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
{
int N = input.size();
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T>);
if(abs(m) > 0.0){
T alpha = 0;
for(int i=k; i<j+1; i++){
v[i] = input[i]/m;
alpha = alpha + v[i]*conj(v[i]);
}
alpha = sqrt(alpha);
beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
else v[dir] = -alpha;
}else{
for(int i=k; i<j+1; i++){
v[i] = 0.0;
}
}
}
/**
Compute the product PA if trans = 0
AP if trans = 1
P = (I - b v transpose(v) )
b = 2/v.v
start at element l of matrix A
v is of length j - k + 1; only those elements of v are nonzero
**/
template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
{
int N ; SizeSquare(A,N);
if(abs(beta) > 0.0){
for(int p=l; p<N; p++){
T s = 0;
if(trans==0){
for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
s *= beta;
for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
} else {
for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
s *= beta;
for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
}
}
}
}
/**
Compute the product PA if trans = 0
AP if trans = 1
P = (I - b v transpose(v) )
b = 2/v.v
start at element l of matrix A
v is of length j - k + 1; only those elements of v are nonzero
A is tridiagonal
**/
template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
{
if(abs(beta) > 0.0){
int N ; SizeSquare(A,N);
DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0);
T s;
for(int p=l; p<M; p++){
s = 0;
if(trans==0){
for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
}else{
for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
}
s = beta*s;
if(trans==0){
for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
}else{
for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
}
}
for(int p=l; p<M; p++){
if(trans==0){
for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
}else{
for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
}
}
}
}
}
#endif
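As a quick check of the conventions above, here is a standalone sketch of the same real Givens rotation on plain doubles; applied with the sign flip used in the QR/Wilkinson sweeps, Givens_mult(H, l, l+1, c, -s, 0), it rotates (y,z) onto (+/- sqrt(y^2+z^2), 0). The helper below is illustrative only, not part of the library:

#include <cmath>
#include <iostream>

// Same branches as Givens_calc above, restricted to doubles.
void givens_calc(double y, double z, double &c, double &s) {
  if (std::abs(z) == 0.0) { c = 1.0; s = 0.0; return; }
  if (std::abs(z) >= std::abs(y)) {
    double t = -y / z;
    s = 1.0 / std::sqrt(1.0 + t * t);
    c = s * t;
  } else {
    double t = -z / y;
    c = 1.0 / std::sqrt(1.0 + t * t);
    s = c * t;
  }
}

int main() {
  double y = 3.0, z = 4.0, c, s;
  givens_calc(y, z, c, s);
  // Row update with (c,-s), as in the Givens_mult(H, l, l+1, c, -s, 0) calls.
  std::cout << c * y - s * z << " " << s * y + c * z << std::endl;  // prints -5 0
}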

View File

@@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceInnerProductMatrix(m_rr,R,R,Orthog);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Cholesky from Eigen
// There exists a ldlt that is documented as more stable
////////////////////////////////////////////////////////////////////////////////////////////////////
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// Force manifest hermitian to avoid rounding related
m_rr = 0.5*(m_rr+m_rr.adjoint());
#if 0
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
auto D_ldlt = m_rr.ldlt().vectorD();
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
#endif
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
// Q = R C^{-1}
//
@@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
//
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
////////////////////////////////////////////////////////////////////////////////////////////////////
// FIXME:: make a sliceMulMatrix to avoid zero vector
sliceMulMatrix(Q,Cinv,R,Orthog);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
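The same Cholesky-based thin QR factorisation can be sketched on an explicit Eigen matrix, independent of the sliced lattice fields. This standalone example (illustrative only, assuming Eigen is available as <Eigen/Dense>) forms the Gram matrix, Cholesky-factorises it, and checks that the resulting Q has orthonormal columns:

#include <iostream>
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXcd R = Eigen::MatrixXcd::Random(6, 3);   // three "vectors" of length six
  Eigen::MatrixXcd m_rr = R.adjoint() * R;                // Gram matrix
  m_rr = 0.5 * (m_rr + m_rr.adjoint());                   // force manifest Hermiticity
  Eigen::MatrixXcd L = m_rr.llt().matrixL();              // Cholesky factor
  Eigen::MatrixXcd C = L.adjoint();
  Eigen::MatrixXcd Q = R * C.inverse();                   // Q = R C^{-1}
  std::cout << (Q.adjoint() * Q - Eigen::MatrixXcd::Identity(3, 3)).norm()
            << std::endl;                                 // ~0: columns of Q are orthonormal
}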

View File

@@ -0,0 +1,753 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BIRL_H
#define GRID_BIRL_H
#include <string.h> //memset
#include <zlib.h>
#include <sys/stat.h>
#include <Grid/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockedGrid.h>
#include <Grid/algorithms/iterative/BlockImplicitlyRestartedLanczos/FieldBasisVector.h>
#include <Grid/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockProjector.h>
namespace Grid {
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template<class Field>
class BlockImplicitlyRestartedLanczos {
const RealD small = 1.0e-16;
public:
int lock;
int get;
int Niter;
int converged;
int Nminres; // Minimum number of restarts; only check for convergence after this many restarts
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
int Np; // Np -- Number of spare vecs in Krylov space
int Nm; // Nm -- total number of vectors
int orth_period;
RealD OrthoTime;
RealD eresid, betastp;
SortEigen<Field> _sort;
LinearFunction<Field> &_HermOp;
LinearFunction<Field> &_HermOpTest;
/////////////////////////
// Constructor
/////////////////////////
BlockImplicitlyRestartedLanczos(
LinearFunction<Field> & HermOp,
LinearFunction<Field> & HermOpTest,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
RealD _betastp, // if beta(k) < betastp: converged
int _Niter, // Max iterations
int _Nminres, int _orth_period = 1) :
_HermOp(HermOp),
_HermOpTest(HermOpTest),
Nstop(_Nstop),
Nk(_Nk),
Nm(_Nm),
eresid(_eresid),
betastp(_betastp),
Niter(_Niter),
Nminres(_Nminres),
orth_period(_orth_period)
{
Np = Nm-Nk; assert(Np>0);
};
BlockImplicitlyRestartedLanczos(
LinearFunction<Field> & HermOp,
LinearFunction<Field> & HermOpTest,
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
RealD _betastp, // if beta(k) < betastp: converged
int _Niter, // Max iterations
int _Nminres,
int _orth_period = 1) :
_HermOp(HermOp),
_HermOpTest(HermOpTest),
Nstop(_Nk),
Nk(_Nk),
Nm(_Nm),
eresid(_eresid),
betastp(_betastp),
Niter(_Niter),
Nminres(_Nminres),
orth_period(_orth_period)
{
Np = Nm-Nk; assert(Np>0);
};
/* Saad PP. 195
1. Choose an initial vector v_1 of 2-norm unity. Set beta_1 = 0, v_0 = 0
2. For k = 1,2,...,m Do:
3.   w_k := A v_k - beta_k v_{k-1}
4.   alpha_k := (w_k, v_k)
5.   w_k := w_k - alpha_k v_k          // w_k orthogonal to v_k
6.   beta_{k+1} := ||w_k||_2. If beta_{k+1} = 0 then Stop
7.   v_{k+1} := w_k / beta_{k+1}
8. EndDo
*/
void step(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
BasisFieldVector<Field>& evec,
Field& w,int Nm,int k)
{
assert( k< Nm );
GridStopWatch gsw_op,gsw_o;
Field& evec_k = evec[k];
gsw_op.Start();
_HermOp(evec_k,w);
gsw_op.Stop();
if(k>0){
w -= lme[k-1] * evec[k-1];
}
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
RealD alph = real(zalph);
w = w - alph * evec_k; // 5. w_k := w_k - alpha_k v_k
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
std::cout<<GridLogMessage << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
const RealD tiny = 1.0e-20;
if ( beta < tiny ) {
std::cout<<GridLogMessage << " beta is tiny "<<beta<<std::endl;
}
lmd[k] = alph;
lme[k] = beta;
gsw_o.Start();
if (k>0 && k % orth_period == 0) {
orthogonalize(w,evec,k); // orthonormalise
}
gsw_o.Stop();
if(k < Nm-1) {
evec[k+1] = w;
}
std::cout << GridLogMessage << "Timing: operator=" << gsw_op.Elapsed() <<
" orth=" << gsw_o.Elapsed() << std::endl;
}
void qr_decomp(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
int Nk,
int Nm,
std::vector<RealD>& Qt,
RealD Dsh,
int kmin,
int kmax)
{
int k = kmin-1;
RealD x;
RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
RealD c = ( lmd[k] -Dsh) *Fden;
RealD s = -lme[k] *Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
x =-s*lme[k+1];
lme[k+1] = c*lme[k+1];
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt[i+Nm*k ];
RealD Qtmp2 = Qt[i+Nm*(k+1)];
Qt[i+Nm*k ] = c*Qtmp1 - s*Qtmp2;
Qt[i+Nm*(k+1)] = s*Qtmp1 + c*Qtmp2;
}
// Givens transformations
for(int k = kmin; k < kmax-1; ++k){
RealD Fden = 1.0/hypot(x,lme[k-1]);
RealD c = lme[k-1]*Fden;
RealD s = - x*Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
lme[k-1] = c*lme[k-1] -s*x;
if(k != kmax-2){
x = -s*lme[k+1];
lme[k+1] = c*lme[k+1];
}
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt[i+Nm*k ];
RealD Qtmp2 = Qt[i+Nm*(k+1)];
Qt[i+Nm*k ] = c*Qtmp1 -s*Qtmp2;
Qt[i+Nm*(k+1)] = s*Qtmp1 +c*Qtmp2;
}
}
}
#ifdef USE_LAPACK_IRL
#define LAPACK_INT int
//long long
void diagonalize_lapack(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
int N1,
int N2,
std::vector<RealD>& Qt,
GridBase *grid){
std::cout << GridLogMessage << "diagonalize_lapack start\n";
GridStopWatch gsw;
const int size = Nm;
// tevals.resize(size);
// tevecs.resize(size);
LAPACK_INT NN = N1;
std::vector<double> evals_tmp(NN);
std::vector<double> evec_tmp(NN*NN);
memset(&evec_tmp[0],0,sizeof(double)*NN*NN);
// double AA[NN][NN];
std::vector<double> DD(NN);
std::vector<double> EE(NN);
for (int i = 0; i< NN; i++)
for (int j = i - 1; j <= i + 1; j++)
if ( j < NN && j >= 0 ) {
if (i==j) DD[i] = lmd[i];
if (i==j) evals_tmp[i] = lmd[i];
if (j==(i-1)) EE[j] = lme[j];
}
LAPACK_INT evals_found;
LAPACK_INT lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
LAPACK_INT liwork = 3+NN*10 ;
std::vector<LAPACK_INT> iwork(liwork);
std::vector<double> work(lwork);
std::vector<LAPACK_INT> isuppz(2*NN);
char jobz = 'V'; // calculate evals & evecs
char range = 'I'; // calculate evals in the index interval [il,iu]
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
std::vector<int> ifail(NN);
LAPACK_INT info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN/total)+1;
double vl = 0.0, vu = 0.0;
LAPACK_INT il = interval*node+1 , iu = interval*(node+1);
if (iu > NN) iu=NN;
double tol = 0.0;
if (1) {
memset(&evals_tmp[0],0,sizeof(double)*NN);
if ( il <= NN){
std::cout << GridLogMessage << "dstegr started" << std::endl;
gsw.Start();
dstegr(&jobz, &range, &NN,
(double*)&DD[0], (double*)&EE[0],
&vl, &vu, &il, &iu, // these four are ignored if the range parameter is 'A'
&tol, // tolerance
&evals_found, &evals_tmp[0], (double*)&evec_tmp[0], &NN,
&isuppz[0],
&work[0], &lwork, &iwork[0], &liwork,
&info);
gsw.Stop();
std::cout << GridLogMessage << "dstegr completed in " << gsw.Elapsed() << std::endl;
for (int i = iu-1; i>= il-1; i--){
evals_tmp[i] = evals_tmp[i - (il-1)];
if (il>1) evals_tmp[i-(il-1)]=0.;
for (int j = 0; j< NN; j++){
evec_tmp[i*NN + j] = evec_tmp[(i - (il-1)) * NN + j];
if (il>1) evec_tmp[(i-(il-1)) * NN + j]=0.;
}
}
}
{
// QMP_sum_double_array(evals_tmp,NN);
// QMP_sum_double_array((double *)evec_tmp,NN*NN);
grid->GlobalSumVector(&evals_tmp[0],NN);
grid->GlobalSumVector(&evec_tmp[0],NN*NN);
}
}
// Cheating a bit: it would be better to sort than simply reverse, but the routine's documentation says the evals are returned in increasing order, whereas the QR path produces them in decreasing order.
for(int i=0;i<NN;i++){
for(int j=0;j<NN;j++)
Qt[(NN-1-i)*N2+j]=evec_tmp[i*NN + j];
lmd [NN-1-i]=evals_tmp[i];
}
std::cout << GridLogMessage << "diagonalize_lapack complete\n";
}
#undef LAPACK_INT
#endif
void diagonalize(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
int N2,
int N1,
std::vector<RealD>& Qt,
GridBase *grid)
{
#ifdef USE_LAPACK_IRL
const int check_lapack=0; // just use lapack if 0, check against lapack if 1
if(!check_lapack)
return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
std::vector <RealD> lmd2(N1);
std::vector <RealD> lme2(N1);
std::vector<RealD> Qt2(N1*N1);
for(int k=0; k<N1; ++k){
lmd2[k] = lmd[k];
lme2[k] = lme[k];
}
for(int k=0; k<N1*N1; ++k)
Qt2[k] = Qt[k];
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
int Niter = 10000*N1;
int kmin = 1;
int kmax = N2;
// (this should be more sophisticated)
for(int iter=0; ; ++iter){
if ( (iter+1)%(100*N1)==0)
std::cout<<GridLogMessage << "[QL method] Not converged - iteration "<<iter+1<<"\n";
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift)
// transformation
qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);
// Convergence criterion (redefinition of kmin and kmax)
for(int j=kmax-1; j>= kmin; --j){
RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
if(fabs(lme[j-1])+dds > dds){
kmax = j+1;
goto continued;
}
}
Niter = iter;
#ifdef USE_LAPACK_IRL
if(check_lapack){
const double SMALL=1e-8;
diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
std::vector <RealD> lmd3(N2);
for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
_sort.push(lmd3,N2);
_sort.push(lmd2,N2);
for(int k=0; k<N2; ++k){
if (fabs(lmd2[k] - lmd3[k]) >SMALL) std::cout<<GridLogMessage <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
// if (fabs(lme2[k] - lme[k]) >SMALL) std::cout<<GridLogMessage <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
}
for(int k=0; k<N1*N1; ++k){
// if (fabs(Qt2[k] - Qt[k]) >SMALL) std::cout<<GridLogMessage <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
}
}
#endif
return;
continued:
for(int j=0; j<kmax-1; ++j){
RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
if(fabs(lme[j])+dds > dds){
kmin = j+1;
break;
}
}
}
std::cout<<GridLogMessage << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
abort();
}
#if 1
template<typename T>
static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
void orthogonalize(Field& w,
BasisFieldVector<Field>& evec,
int k)
{
double t0=-usecond()/1e6;
evec.orthogonalize(w,k);
normalise(w);
t0+=usecond()/1e6;
OrthoTime +=t0;
}
void setUnit_Qt(int Nm, std::vector<RealD> &Qt) {
for(int i=0; i<Qt.size(); ++i) Qt[i] = 0.0;
for(int k=0; k<Nm; ++k) Qt[k + k*Nm] = 1.0;
}
/* Rudy Arthur's thesis pp.137
------------------------
Require: M > K, P = M - K
Compute the factorization A V_M = V_M H_M + f_M e_M^dag
repeat
  Q = I
  for i = 1,...,P do
    Q_i R_i = H_M - theta_i I ;   Q = Q Q_i
    H_M = Q_i^dag H_M Q_i
  end for
  beta_K = H_M(K+1,K) ;   sigma_K = Q(M,K)
  r = v_{K+1} beta_K + r sigma_K
  V_K = V_M(1:M) Q(1:M,1:K)
  H_K = H_M(1:K,1:K)
  => A V_K = V_K H_K + f_K e_K^dag.  Extend to an M = K + P step factorization A V_M = V_M H_M + f_M e_M^dag
until convergence
*/
void calc(std::vector<RealD>& eval,
BasisFieldVector<Field>& evec,
const Field& src,
int& Nconv,
bool reverse,
int SkipTest)
{
GridBase *grid = evec._v[0]._grid;//evec.get(0 + evec_offset)._grid;
assert(grid == src._grid);
std::cout<<GridLogMessage << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
std::cout<<GridLogMessage << " -- Nm = " << Nm << std::endl;
std::cout<<GridLogMessage << " -- size of eval = " << eval.size() << std::endl;
std::cout<<GridLogMessage << " -- size of evec = " << evec.size() << std::endl;
assert(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
{
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
_HermOpTest(src_n,tmp);
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp;
}
}
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
std::vector<RealD> eval2(Nm);
std::vector<RealD> eval2_copy(Nm);
std::vector<RealD> Qt(Nm*Nm);
Field f(grid);
Field v(grid);
int k1 = 1;
int k2 = Nk;
Nconv = 0;
RealD beta_k;
// Set initial vector
evec[0] = src;
normalise(evec[0]);
std:: cout<<GridLogMessage <<"norm2(evec[0])= " << norm2(evec[0])<<std::endl;
// Initial Nk steps
OrthoTime=0.;
double t0=usecond()/1e6;
for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
double t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::Initial steps: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
std::cout<<GridLogMessage <<"IRL::Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
t1=usecond()/1e6;
// Restarting loop begins
for(int iter = 0; iter<Niter; ++iter){
std::cout<<GridLogMessage<<"\n Restart iteration = "<< iter << std::endl;
//
// Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs.
// We loop over
//
OrthoTime=0.;
for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL:: "<<Np <<" steps: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
std::cout<<GridLogMessage <<"IRL::Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
f *= lme[Nm-1];
t1=usecond()/1e6;
// getting eigenvalues
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k+k1-1];
lme2[k] = lme[k+k1-1];
}
setUnit_Qt(Nm,Qt);
diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL:: diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
// sorting
eval2_copy = eval2;
_sort.push(eval2,Nm);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL:: eval sorting: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
// Implicitly shifted QR transformations
setUnit_Qt(Nm,Qt);
for(int ip=0; ip<k2; ++ip){
std::cout<<GridLogMessage << "eval "<< ip << " "<< eval2[ip] << std::endl;
}
for(int ip=k2; ip<Nm; ++ip){
std::cout<<GridLogMessage << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
}
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::qr_decomp: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
assert(k2<Nm);
assert(k2<Nm);
assert(k1>0);
evec.rotate(Qt,k1-1,k2+1,0,Nm,Nm);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::QR rotation: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
fflush(stdout);
// Compressed vector f and beta(k2)
f *= Qt[Nm-1+Nm*(k2-1)];
f += lme[k2-1] * evec[k2];
beta_k = norm2(f);
beta_k = sqrt(beta_k);
std::cout<<GridLogMessage<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k;
evec[k2] = betar * f;
lme[k2-1] = beta_k;
// Convergence test
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k];
lme2[k] = lme[k];
std::cout<<GridLogMessage << "eval2[" << k << "] = " << eval2[k] << std::endl;
}
setUnit_Qt(Nm,Qt);
diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
Nconv = 0;
if (iter >= Nminres) {
std::cout << GridLogMessage << "Rotation to test convergence " << std::endl;
Field ev0_orig(grid);
ev0_orig = evec[0];
evec.rotate(Qt,0,Nk,0,Nk,Nm);
{
std::cout << GridLogMessage << "Test convergence" << std::endl;
Field B(grid);
for(int j = 0; j<Nk; j+=SkipTest){
B=evec[j];
//std::cout << "Checkerboard: " << evec[j].checkerboard << std::endl;
B.checkerboard = evec[0].checkerboard;
_HermOpTest(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval2[j] = vnum/vden;
v -= eval2[j]*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<j<<"] "
<<"eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[j] << " (" << eval2_copy[j] << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv
<<" "<< vnum/(sqrt(vden)*sqrt(vv0))
<< " norm(B["<<j<<"])="<< vden <<std::endl;
// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
if((vv<eresid*eresid) && (j == Nconv) ){
Nconv+=SkipTest;
}
}
// test if we converged, if so, terminate
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::convergence testing: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
std::cout<<GridLogMessage<<" #modes converged: "<<Nconv<<std::endl;
if( Nconv>=Nstop || beta_k < betastp){
goto converged;
}
std::cout << GridLogMessage << "Rotate back" << std::endl;
//B[j] +=Qt[k+_Nm*j] * _v[k]._odata[ss];
{
Eigen::MatrixXd qm = Eigen::MatrixXd::Zero(Nk,Nk);
for (int k=0;k<Nk;k++)
for (int j=0;j<Nk;j++)
qm(j,k) = Qt[k+Nm*j];
GridStopWatch timeInv;
timeInv.Start();
Eigen::MatrixXd qmI = qm.inverse();
timeInv.Stop();
std::vector<RealD> QtI(Nm*Nm);
for (int k=0;k<Nk;k++)
for (int j=0;j<Nk;j++)
QtI[k+Nm*j] = qmI(j,k);
RealD res_check_rotate_inverse = (qm*qmI - Eigen::MatrixXd::Identity(Nk,Nk)).norm(); // sqrt( |X|^2 )
assert(res_check_rotate_inverse < 1e-7);
evec.rotate(QtI,0,Nk,0,Nk,Nm);
axpy(ev0_orig,-1.0,evec[0],ev0_orig);
std::cout << GridLogMessage << "Rotation done (in " << timeInv.Elapsed() << " = " << timeInv.useconds() << " us" <<
", error = " << res_check_rotate_inverse <<
"); | evec[0] - evec[0]_orig | = " << ::sqrt(norm2(ev0_orig)) << std::endl;
}
}
} else {
std::cout << GridLogMessage << "iter < Nminres: do not yet test for convergence\n";
} // end of iter loop
}
std::cout<<GridLogMessage<<"\n NOT converged.\n";
abort();
converged:
if (SkipTest == 1) {
eval = eval2;
} else {
// test quickly
for (int j=0;j<Nstop;j+=SkipTest) {
std::cout<<GridLogMessage << "Eigenvalue[" << j << "] = " << eval2[j] << " (" << eval2_copy[j] << ")" << std::endl;
}
eval2_copy.resize(eval2.size());
eval = eval2_copy;
}
evec.sortInPlace(eval,reverse);
{
// test
for (int j=0;j<Nstop;j++) {
std::cout<<GridLogMessage << " |e[" << j << "]|^2 = " << norm2(evec[j]) << std::endl;
}
}
//_sort.push(eval,evec,Nconv);
//evec.sort(eval,Nconv);
std::cout<<GridLogMessage << "\n Converged\n Summary :\n";
std::cout<<GridLogMessage << " -- Iterations = "<< Nconv << "\n";
std::cout<<GridLogMessage << " -- beta(k) = "<< beta_k << "\n";
std::cout<<GridLogMessage << " -- Nconv = "<< Nconv << "\n";
}
#endif
};
}
#endif
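The step() routine above is the standard Lanczos three-term recurrence applied to lattice Fields. As a reference for the alpha/beta bookkeeping, here is a self-contained scalar sketch on a small dense symmetric matrix, with std::vector<double> standing in for the Field type (illustrative only; no reorthogonalisation, and the test matrix is made up):

#include <cmath>
#include <iostream>
#include <vector>

int main() {
  const int N = 4, M = 3;                       // matrix size, Krylov steps
  std::vector<std::vector<double>> A = {        // symmetric test matrix
      {4, 1, 0, 0}, {1, 3, 1, 0}, {0, 1, 2, 1}, {0, 0, 1, 1}};
  std::vector<std::vector<double>> v(M + 1, std::vector<double>(N, 0.0));
  std::vector<double> alpha(M), beta(M, 0.0);

  v[0][0] = 1.0;                                // unit-norm start vector
  for (int k = 0; k < M; k++) {
    std::vector<double> w(N, 0.0);
    for (int i = 0; i < N; i++)                 // w = A v_k
      for (int j = 0; j < N; j++) w[i] += A[i][j] * v[k][j];
    if (k > 0)                                  // w -= beta_{k-1} v_{k-1}
      for (int i = 0; i < N; i++) w[i] -= beta[k - 1] * v[k - 1][i];
    alpha[k] = 0.0;                             // alpha_k = (w, v_k)
    for (int i = 0; i < N; i++) alpha[k] += w[i] * v[k][i];
    for (int i = 0; i < N; i++) w[i] -= alpha[k] * v[k][i];
    double nrm = 0.0;
    for (int i = 0; i < N; i++) nrm += w[i] * w[i];
    beta[k] = std::sqrt(nrm);                   // beta_k = ||w||, v_{k+1} = w / beta_k
    for (int i = 0; i < N; i++) v[k + 1][i] = w[i] / beta[k];
    std::cout << "alpha[" << k << "] = " << alpha[k]
              << "  beta[" << k << "] = " << beta[k] << std::endl;
  }
}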

View File

@@ -0,0 +1,143 @@
namespace Grid {
/*
BlockProjector
If _HP_BLOCK_PROJECTORS_ is defined, we assume that _evec is a basis that is not
fully orthonormalized (to the precision of the coarse field) and we allow for higher-precision
coarse field than basis field.
*/
//#define _HP_BLOCK_PROJECTORS_
template<typename Field>
class BlockProjector {
public:
BasisFieldVector<Field>& _evec;
BlockedGrid<Field>& _bgrid;
BlockProjector(BasisFieldVector<Field>& evec, BlockedGrid<Field>& bgrid) : _evec(evec), _bgrid(bgrid) {
}
void createOrthonormalBasis(RealD thres = 0.0) {
GridStopWatch sw;
sw.Start();
int cnt = 0;
#pragma omp parallel shared(cnt)
{
int lcnt = 0;
#pragma omp for
for (int b=0;b<_bgrid._o_blocks;b++) {
for (int i=0;i<_evec._Nm;i++) {
auto nrm0 = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]);
// |i> -= <j|i> |j>
for (int j=0;j<i;j++) {
_bgrid.block_caxpy(b,_evec._v[i],-_bgrid.block_sp(b,_evec._v[j],_evec._v[i]),_evec._v[j],_evec._v[i]);
}
auto nrm = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]);
auto eps = nrm/nrm0;
if (Reduce(eps).real() < thres) {
lcnt++;
}
// TODO: if norm is too small, remove this eigenvector/mark as not needed; in practice: set it to zero norm here and return a mask
// that is then used later to decide not to write certain eigenvectors to disk (add a norm calculation before subtraction step and look at nrm/nrm0 < eps to decide)
_bgrid.block_cscale(b,1.0 / sqrt(nrm),_evec._v[i]);
}
}
#pragma omp critical
{
cnt += lcnt;
}
}
sw.Stop();
std::cout << GridLogMessage << "Gram-Schmidt to create blocked basis took " << sw.Elapsed() << " (" << ((RealD)cnt / (RealD)_bgrid._o_blocks / (RealD)_evec._Nm)
<< " below threshold)" << std::endl;
}
template<typename CoarseField>
void coarseToFine(const CoarseField& in, Field& out) {
out = zero;
out.checkerboard = _evec._v[0].checkerboard;
int Nbasis = sizeof(in._odata[0]._internal._internal) / sizeof(in._odata[0]._internal._internal[0]);
assert(Nbasis == _evec._Nm);
#pragma omp parallel for
for (int b=0;b<_bgrid._o_blocks;b++) {
for (int j=0;j<_evec._Nm;j++) {
_bgrid.block_caxpy(b,out,in._odata[b]._internal._internal[j],_evec._v[j],out);
}
}
}
template<typename CoarseField>
void fineToCoarse(const Field& in, CoarseField& out) {
out = zero;
int Nbasis = sizeof(out._odata[0]._internal._internal) / sizeof(out._odata[0]._internal._internal[0]);
assert(Nbasis == _evec._Nm);
Field tmp(_bgrid._grid);
tmp = in;
#pragma omp parallel for
for (int b=0;b<_bgrid._o_blocks;b++) {
for (int j=0;j<_evec._Nm;j++) {
// |rhs> -= <j|rhs> |j>
auto c = _bgrid.block_sp(b,_evec._v[j],tmp);
_bgrid.block_caxpy(b,tmp,-c,_evec._v[j],tmp); // may make this more numerically stable
out._odata[b]._internal._internal[j] = c;
}
}
}
template<typename CoarseField>
void deflateFine(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
result = zero;
for (int i=0;i<N;i++) {
Field tmp(result._grid);
coarseToFine(_coef._v[i],tmp);
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
template<typename CoarseField>
void deflateCoarse(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
CoarseField src_coarse(_coef._v[0]._grid);
CoarseField result_coarse = src_coarse;
result_coarse = zero;
fineToCoarse(src_orig,src_coarse);
for (int i=0;i<N;i++) {
axpy(result_coarse,TensorRemove(innerProduct(_coef._v[i],src_coarse)) / eval[i],_coef._v[i],result_coarse);
}
coarseToFine(result_coarse,result);
}
template<typename CoarseField>
void deflate(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
// Deflation on coarse Grid is much faster, so use it by default. Deflation on fine Grid is kept for legacy reasons for now.
deflateCoarse(_coef,eval,N,src_orig,result);
}
};
}
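fineToCoarse and coarseToFine above are, per block, just projection onto and reconstruction from the basis vectors. A standalone sketch with ordinary real vectors (illustrative only) shows the coefficient extraction c_j = <e_j, v> and the reconstruction sum_j c_j e_j:

#include <iostream>
#include <vector>

int main() {
  std::vector<std::vector<double>> e = {{1, 0, 0}, {0, 1, 0}};  // orthonormal basis
  std::vector<double> v = {0.3, -2.0, 5.0};                     // "fine" vector
  std::vector<double> c(e.size(), 0.0);                         // "coarse" coefficients
  for (size_t j = 0; j < e.size(); j++)                         // fineToCoarse: c_j = <e_j, v>
    for (size_t i = 0; i < v.size(); i++) c[j] += e[j][i] * v[i];
  std::vector<double> r(v.size(), 0.0);                         // coarseToFine: r = sum_j c_j e_j
  for (size_t j = 0; j < e.size(); j++)
    for (size_t i = 0; i < v.size(); i++) r[i] += c[j] * e[j][i];
  for (double x : r) std::cout << x << " ";                     // prints 0.3 -2 0
  std::cout << std::endl;                                       // the component outside the span is lost
}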

View File

@@ -0,0 +1,401 @@
namespace Grid {
template<typename Field>
class BlockedGrid {
public:
GridBase* _grid;
typedef typename Field::scalar_type Coeff_t;
typedef typename Field::vector_type vCoeff_t;
std::vector<int> _bs; // block size
std::vector<int> _nb; // number of blocks
std::vector<int> _l; // local dimensions irrespective of cb
std::vector<int> _l_cb; // local dimensions of checkerboarded vector
std::vector<int> _l_cb_o; // local dimensions of inner checkerboarded vector
std::vector<int> _bs_cb; // block size in checkerboarded vector
std::vector<int> _nb_o; // number of blocks of simd o-sites
int _nd, _blocks, _cf_size, _cf_block_size, _cf_o_block_size, _o_blocks, _block_sites;
BlockedGrid(GridBase* grid, const std::vector<int>& block_size) :
_grid(grid), _bs(block_size), _nd((int)_bs.size()),
_nb(block_size), _l(block_size), _l_cb(block_size), _nb_o(block_size),
_l_cb_o(block_size), _bs_cb(block_size) {
_blocks = 1;
_o_blocks = 1;
_l = grid->FullDimensions();
_l_cb = grid->LocalDimensions();
_l_cb_o = grid->_rdimensions;
_cf_size = 1;
_block_sites = 1;
for (int i=0;i<_nd;i++) {
_l[i] /= grid->_processors[i];
assert(!(_l[i] % _bs[i])); // lattice must accommodate choice of blocksize
int r = _l[i] / _l_cb[i];
assert(!(_bs[i] % r)); // checkerboarding must accommodate choice of blocksize
_bs_cb[i] = _bs[i] / r;
_block_sites *= _bs_cb[i];
_nb[i] = _l[i] / _bs[i];
_nb_o[i] = _nb[i] / _grid->_simd_layout[i];
if (_nb[i] % _grid->_simd_layout[i]) { // simd must accommodate choice of blocksize
std::cout << GridLogMessage << "Problem: _nb[" << i << "] = " << _nb[i] << " _grid->_simd_layout[" << i << "] = " << _grid->_simd_layout[i] << std::endl;
assert(0);
}
_blocks *= _nb[i];
_o_blocks *= _nb_o[i];
_cf_size *= _l[i];
}
_cf_size *= 12 / 2;
_cf_block_size = _cf_size / _blocks;
_cf_o_block_size = _cf_size / _o_blocks;
std::cout << GridLogMessage << "BlockedGrid:" << std::endl;
std::cout << GridLogMessage << " _l = " << _l << std::endl;
std::cout << GridLogMessage << " _l_cb = " << _l_cb << std::endl;
std::cout << GridLogMessage << " _l_cb_o = " << _l_cb_o << std::endl;
std::cout << GridLogMessage << " _bs = " << _bs << std::endl;
std::cout << GridLogMessage << " _bs_cb = " << _bs_cb << std::endl;
std::cout << GridLogMessage << " _nb = " << _nb << std::endl;
std::cout << GridLogMessage << " _nb_o = " << _nb_o << std::endl;
std::cout << GridLogMessage << " _blocks = " << _blocks << std::endl;
std::cout << GridLogMessage << " _o_blocks = " << _o_blocks << std::endl;
std::cout << GridLogMessage << " sizeof(vCoeff_t) = " << sizeof(vCoeff_t) << std::endl;
std::cout << GridLogMessage << " _cf_size = " << _cf_size << std::endl;
std::cout << GridLogMessage << " _cf_block_size = " << _cf_block_size << std::endl;
std::cout << GridLogMessage << " _block_sites = " << _block_sites << std::endl;
std::cout << GridLogMessage << " _grid->oSites() = " << _grid->oSites() << std::endl;
// _grid->Barrier();
//abort();
}
void block_to_coor(int b, std::vector<int>& x0) {
std::vector<int> bcoor;
bcoor.resize(_nd);
x0.resize(_nd);
assert(b < _o_blocks);
Lexicographic::CoorFromIndex(bcoor,b,_nb_o);
int i;
for (i=0;i<_nd;i++) {
x0[i] = bcoor[i]*_bs_cb[i];
}
//std::cout << GridLogMessage << "Map block b -> " << x0 << std::endl;
}
void block_site_to_o_coor(const std::vector<int>& x0, std::vector<int>& coor, int i) {
Lexicographic::CoorFromIndex(coor,i,_bs_cb);
for (int j=0;j<_nd;j++)
coor[j] += x0[j];
}
int block_site_to_o_site(const std::vector<int>& x0, int i) {
std::vector<int> coor; coor.resize(_nd);
block_site_to_o_coor(x0,coor,i);
Lexicographic::IndexFromCoor(coor,i,_l_cb_o);
return i;
}
vCoeff_t block_sp(int b, const Field& x, const Field& y) {
std::vector<int> x0;
block_to_coor(b,x0);
vCoeff_t ret = 0.0;
for (int i=0;i<_block_sites;i++) { // only odd sites
int ss = block_site_to_o_site(x0,i);
ret += TensorRemove(innerProduct(x._odata[ss],y._odata[ss]));
}
return ret;
}
vCoeff_t block_sp(int b, const Field& x, const std::vector< ComplexD >& y) {
std::vector<int> x0;
block_to_coor(b,x0);
constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t);
int lsize = _cf_o_block_size / _block_sites;
std::vector< ComplexD > ret(nsimd);
for (int i=0;i<nsimd;i++)
ret[i] = 0.0;
for (int i=0;i<_block_sites;i++) { // only odd sites
int ss = block_site_to_o_site(x0,i);
int n = lsize / nsimd;
for (int l=0;l<n;l++) {
for (int j=0;j<nsimd;j++) {
int t = lsize * i + l*nsimd + j;
ret[j] += conjugate(((Coeff_t*)&x._odata[ss]._internal)[l*nsimd + j]) * y[t];
}
}
}
vCoeff_t vret;
for (int i=0;i<nsimd;i++)
((Coeff_t*)&vret)[i] = (Coeff_t)ret[i];
return vret;
}
template<class T>
void vcaxpy(iScalar<T>& r,const vCoeff_t& a,const iScalar<T>& x,const iScalar<T>& y) {
vcaxpy(r._internal,a,x._internal,y._internal);
}
template<class T,int N>
void vcaxpy(iVector<T,N>& r,const vCoeff_t& a,const iVector<T,N>& x,const iVector<T,N>& y) {
for (int i=0;i<N;i++)
vcaxpy(r._internal[i],a,x._internal[i],y._internal[i]);
}
void vcaxpy(vCoeff_t& r,const vCoeff_t& a,const vCoeff_t& x,const vCoeff_t& y) {
r = a*x + y;
}
void block_caxpy(int b, Field& ret, const vCoeff_t& a, const Field& x, const Field& y) {
std::vector<int> x0;
block_to_coor(b,x0);
for (int i=0;i<_block_sites;i++) { // only odd sites
int ss = block_site_to_o_site(x0,i);
vcaxpy(ret._odata[ss],a,x._odata[ss],y._odata[ss]);
}
}
void block_caxpy(int b, std::vector< ComplexD >& ret, const vCoeff_t& a, const Field& x, const std::vector< ComplexD >& y) {
std::vector<int> x0;
block_to_coor(b,x0);
constexpr int nsimd = sizeof(vCoeff_t) / sizeof(Coeff_t);
int lsize = _cf_o_block_size / _block_sites;
for (int i=0;i<_block_sites;i++) { // only odd sites
int ss = block_site_to_o_site(x0,i);
int n = lsize / nsimd;
for (int l=0;l<n;l++) {
vCoeff_t r = a* ((vCoeff_t*)&x._odata[ss]._internal)[l];
for (int j=0;j<nsimd;j++) {
int t = lsize * i + l*nsimd + j;
ret[t] = y[t] + ((Coeff_t*)&r)[j];
}
}
}
}
void block_set(int b, Field& ret, const std::vector< ComplexD >& x) {
std::vector<int> x0;
block_to_coor(b,x0);
int lsize = _cf_o_block_size / _block_sites;
for (int i=0;i<_block_sites;i++) { // only odd sites
int ss = block_site_to_o_site(x0,i);
for (int l=0;l<lsize;l++)
((Coeff_t*)&ret._odata[ss]._internal)[l] = (Coeff_t)x[lsize * i + l]; // convert precision
}
}
void block_get(int b, const Field& ret, std::vector< ComplexD >& x) {
std::vector<int> x0;
block_to_coor(b,x0);
int lsize = _cf_o_block_size / _block_sites;
for (int i=0;i<_block_sites;i++) { // only odd sites
int ss = block_site_to_o_site(x0,i);
for (int l=0;l<lsize;l++)
x[lsize * i + l] = (ComplexD)((Coeff_t*)&ret._odata[ss]._internal)[l];
}
}
template<class T>
void vcscale(iScalar<T>& r,const vCoeff_t& a,const iScalar<T>& x) {
vcscale(r._internal,a,x._internal);
}
template<class T,int N>
void vcscale(iVector<T,N>& r,const vCoeff_t& a,const iVector<T,N>& x) {
for (int i=0;i<N;i++)
vcscale(r._internal[i],a,x._internal[i]);
}
void vcscale(vCoeff_t& r,const vCoeff_t& a,const vCoeff_t& x) {
r = a*x;
}
void block_cscale(int b, const vCoeff_t& a, Field& ret) {
std::vector<int> x0;
block_to_coor(b,x0);
for (int i=0;i<_block_sites;i++) { // only odd sites
int ss = block_site_to_o_site(x0,i);
vcscale(ret._odata[ss],a,ret._odata[ss]);
}
}
void getCanonicalBlockOffset(int cb, std::vector<int>& x0) {
const int ndim = 5;
assert(_nb.size() == ndim);
std::vector<int> _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] };
std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
x0.resize(ndim);
assert(cb >= 0);
assert(cb < _nbc[0]*_nbc[1]*_nbc[2]*_nbc[3]*_nbc[4]);
Lexicographic::CoorFromIndex(x0,cb,_nbc);
int i;
for (i=0;i<ndim;i++) {
x0[i] *= _bsc[i];
}
//if (cb < 2)
// std::cout << GridLogMessage << "Map: " << cb << " To: " << x0 << std::endl;
}
void pokeBlockOfVectorCanonical(int cb,Field& v,const std::vector<float>& buf) {
std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
std::vector<int> ldim = v._grid->LocalDimensions();
std::vector<int> cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] };
const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4];
// take canonical block cb of v and put it in canonical ordering in buf
std::vector<int> cx0;
getCanonicalBlockOffset(cb,cx0);
#pragma omp parallel
{
std::vector<int> co0,cl0;
co0=cx0; cl0=cx0;
#pragma omp for
for (int i=0;i<_nbsc;i++) {
Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo
for (int j=0;j<(int)_bsc.size();j++)
cl0[j] = cx0[j] + co0[j];
std::vector<int> l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] };
int oi = v._grid->oIndex(l0);
int ii = v._grid->iIndex(l0);
int lti = i;
//if (cb < 2 && i<2)
// std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl;
for (int s=0;s<4;s++)
for (int c=0;c<3;c++) {
Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii];
int ti = 12*lti + 3*s + c;
ld = Coeff_t(buf[2*ti+0], buf[2*ti+1]);
}
}
}
}
void peekBlockOfVectorCanonical(int cb,const Field& v,std::vector<float>& buf) {
std::vector<int> _bsc = { _bs[1], _bs[2], _bs[3], _bs[4], _bs[0] };
std::vector<int> ldim = v._grid->LocalDimensions();
std::vector<int> cldim = { ldim[1], ldim[2], ldim[3], ldim[4], ldim[0] };
const int _nbsc = _bs_cb[0]*_bs_cb[1]*_bs_cb[2]*_bs_cb[3]*_bs_cb[4];
// take canonical block cb of v and put it in canonical ordering in buf
std::vector<int> cx0;
getCanonicalBlockOffset(cb,cx0);
buf.resize(_cf_block_size * 2);
#pragma omp parallel
{
std::vector<int> co0,cl0;
co0=cx0; cl0=cx0;
#pragma omp for
for (int i=0;i<_nbsc;i++) {
Lexicographic::CoorFromIndex(co0,2*i,_bsc); // 2* for eo
for (int j=0;j<(int)_bsc.size();j++)
cl0[j] = cx0[j] + co0[j];
std::vector<int> l0 = { cl0[4], cl0[0], cl0[1], cl0[2], cl0[3] };
int oi = v._grid->oIndex(l0);
int ii = v._grid->iIndex(l0);
int lti = i;
//if (cb < 2 && i<2)
// std::cout << GridLogMessage << "Map: " << cb << ", " << i << " To: " << cl0 << ", " << cx0 << ", " << oi << ", " << ii << std::endl;
for (int s=0;s<4;s++)
for (int c=0;c<3;c++) {
Coeff_t& ld = ((Coeff_t*)&v._odata[oi]._internal._internal[s]._internal[c])[ii];
int ti = 12*lti + 3*s + c;
buf[2*ti+0] = ld.real();
buf[2*ti+1] = ld.imag();
}
}
}
}
int globalToLocalCanonicalBlock(int slot,const std::vector<int>& src_nodes,int nb) {
// processor coordinate
int _nd = (int)src_nodes.size();
std::vector<int> _src_nodes = src_nodes;
std::vector<int> pco(_nd);
Lexicographic::CoorFromIndex(pco,slot,_src_nodes);
std::vector<int> cpco = { pco[1], pco[2], pco[3], pco[4], pco[0] };
// get local block
std::vector<int> _nbc = { _nb[1], _nb[2], _nb[3], _nb[4], _nb[0] };
assert(_nd == 5);
std::vector<int> c_src_local_blocks(_nd);
for (int i=0;i<_nd;i++) {
assert(_grid->_fdimensions[i] % (src_nodes[i] * _bs[i]) == 0);
c_src_local_blocks[(i+4) % 5] = _grid->_fdimensions[i] / src_nodes[i] / _bs[i];
}
std::vector<int> cbcoor(_nd); // coordinate of block in slot in canonical form
Lexicographic::CoorFromIndex(cbcoor,nb,c_src_local_blocks);
// cpco, cbcoor
std::vector<int> clbcoor(_nd);
for (int i=0;i<_nd;i++) {
int cgcoor = cpco[i] * c_src_local_blocks[i] + cbcoor[i]; // global block coordinate
int pcoor = cgcoor / _nbc[i]; // processor coordinate in my Grid
int tpcoor = _grid->_processor_coor[(i+1)%5];
if (pcoor != tpcoor)
return -1;
clbcoor[i] = cgcoor - tpcoor * _nbc[i]; // canonical local block coordinate for canonical dimension i
}
int lnb;
Lexicographic::IndexFromCoor(clbcoor,lnb,_nbc);
//std::cout << "Mapped slot = " << slot << " nb = " << nb << " to " << lnb << std::endl;
return lnb;
}
};
}
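The block bookkeeping above rests on lexicographic index/coordinate conversions. A self-contained sketch of that round trip (illustrative only; it assumes a first-coordinate-fastest ordering, and the exact convention of Grid's Lexicographic helpers should be checked against the source):

#include <iostream>
#include <vector>

void coor_from_index(std::vector<int> &coor, int index, const std::vector<int> &dims) {
  for (size_t d = 0; d < dims.size(); d++) {
    coor[d] = index % dims[d];
    index /= dims[d];
  }
}

int index_from_coor(const std::vector<int> &coor, const std::vector<int> &dims) {
  int idx = 0;
  for (int d = (int)dims.size() - 1; d >= 0; d--) idx = idx * dims[d] + coor[d];
  return idx;
}

int main() {
  std::vector<int> nb = {2, 2, 2, 2};                     // two blocks per dimension
  std::vector<int> bcoor(4);
  coor_from_index(bcoor, 11, nb);                         // block 11 -> coordinate (1,1,0,1)
  std::cout << index_from_coor(bcoor, nb) << std::endl;   // round-trips back to 11
}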

View File

@@ -0,0 +1,163 @@
namespace Grid {
template<class Field>
class BasisFieldVector {
public:
int _Nm;
typedef typename Field::scalar_type Coeff_t;
typedef typename Field::vector_type vCoeff_t;
typedef typename Field::vector_object vobj;
typedef typename vobj::scalar_object sobj;
std::vector<Field> _v; // _Nfull vectors
void report(int n,GridBase* value) {
std::cout << GridLogMessage << "BasisFieldVector allocated:\n";
std::cout << GridLogMessage << " Delta N = " << n << "\n";
std::cout << GridLogMessage << " Size of full vectors (size) = " <<
((double)n*sizeof(vobj)*value->oSites() / 1024./1024./1024.) << " GB\n";
std::cout << GridLogMessage << " Size = " << _v.size() << " Capacity = " << _v.capacity() << std::endl;
value->Barrier();
if (value->IsBoss()) {
system("cat /proc/meminfo");
}
value->Barrier();
}
BasisFieldVector(int Nm,GridBase* value) : _Nm(Nm), _v(Nm,value) {
report(Nm,value);
}
~BasisFieldVector() {
}
Field& operator[](int i) {
return _v[i];
}
void orthogonalize(Field& w, int k) {
for(int j=0; j<k; ++j){
Coeff_t ip = (Coeff_t)innerProduct(_v[j],w);
w = w - ip*_v[j];
}
}
void rotate(std::vector<RealD>& Qt,int j0, int j1, int k0,int k1,int Nm) {
GridBase* grid = _v[0]._grid;
#pragma omp parallel
{
std::vector < vobj > B(Nm);
#pragma omp for
for(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt[k+Nm*j] * _v[k]._odata[ss];
}
}
for(int j=j0; j<j1; ++j){
_v[j]._odata[ss] = B[j];
}
}
}
}
size_t size() const {
return _Nm;
}
void resize(int n) {
if (n > _Nm)
_v.reserve(n);
_v.resize(n,_v[0]._grid);
if (n < _Nm)
_v.shrink_to_fit();
report(n - _Nm,_v[0]._grid);
_Nm = n;
}
std::vector<int> getIndex(std::vector<RealD>& sort_vals) {
std::vector<int> idx(sort_vals.size());
iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
sort(idx.begin(), idx.end(),
[&sort_vals](int i1, int i2) {return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);});
return idx;
}
void reorderInPlace(std::vector<RealD>& sort_vals, std::vector<int>& idx) {
GridStopWatch gsw;
gsw.Start();
int nswaps = 0;
for (size_t i=0;i<idx.size();i++) {
if (idx[i] != i) {
// find proper place (this could be done in logarithmic time, don't bother for now)
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(j!=idx.size());
Field _t(_v[0]._grid);
_t = _v[idx[j]];
_v[idx[j]] = _v[idx[i]];
_v[idx[i]] = _t;
RealD _td = sort_vals[idx[j]];
sort_vals[idx[j]] = sort_vals[idx[i]];
sort_vals[idx[i]] = _td;
int _tt = idx[i];
idx[i] = idx[j];
idx[j] = _tt;
nswaps++;
}
}
// sort values
gsw.Stop();
std::cout << GridLogMessage << "Sorted eigenspace in place in " << gsw.Elapsed() << " using " << nswaps << " swaps" << std::endl;
}
void sortInPlace(std::vector<RealD>& sort_vals, bool reverse) {
std::vector<int> idx = getIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
reorderInPlace(sort_vals,idx);
}
void deflate(const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = zero;
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
};
}
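getIndex and reorderInPlace above sort the eigenpairs by building an index permutation and then applying it with swaps of whole Fields. The same pattern on plain doubles (illustrative only; this version follows permutation cycles instead of the linear search used above):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<double> vals = {0.5, -0.1, 2.0, -1.5};
  std::vector<int> idx(vals.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::sort(idx.begin(), idx.end(),
            [&](int a, int b) { return std::fabs(vals[a]) < std::fabs(vals[b]); });
  // Apply the permutation in place by swapping, as reorderInPlace does for Fields.
  for (size_t i = 0; i < idx.size(); i++) {
    while ((size_t)idx[i] != i) {
      int j = idx[i];
      std::swap(vals[i], vals[j]);
      std::swap(idx[i], idx[j]);
    }
  }
  for (double v : vals) std::cout << v << " ";   // prints -0.1 0.5 -1.5 2
  std::cout << std::endl;
}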

View File

@@ -52,8 +52,8 @@ class ConjugateGradient : public OperatorFunction<Field> {
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
Field &psi) {
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard;
conformable(psi, src);

View File

@@ -60,6 +60,7 @@ namespace Grid {
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
TotalInnerIterations = 0;
GridStopWatch TotalTimer;

View File

@@ -0,0 +1,256 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
namespace Grid {
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
}
void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
psi.checkerboard = src.checkerboard;
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
return;
}
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.checkerboard = r.checkerboard;
precisionChange(r_f, r);
FieldF psi_f(r_f);
psi_f = zero;
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
psi_f = zero;
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
};
};
#endif
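The reliable-update logic above keeps the cheap single-precision recurrence residual and replaces it by a freshly computed double-precision true residual whenever it has dropped by a factor Delta relative to the largest residual seen since the last update. A minimal, self-contained sketch of that residual-replacement idea on a tiny SPD system (illustrative only; the single-precision solution accumulation used by the class above is elided, and the matrix, Delta and tolerance are made-up test values):

#include <cmath>
#include <iostream>

int main() {
  const int N = 3;
  double A[N][N] = {{4, 1, 0}, {1, 3, 1}, {0, 1, 2}};   // SPD test matrix
  double b[N] = {1, 2, 0}, x[N] = {0, 0, 0};
  float r[N], p[N];                                     // single-precision recurrence
  for (int i = 0; i < N; i++) { r[i] = (float)b[i]; p[i] = r[i]; }
  double cp = 0; for (int i = 0; i < N; i++) cp += (double)r[i] * r[i];
  double maxResid = cp; const double Delta = 0.1, rsq = 1.0e-10;
  for (int k = 1; k <= 20 && cp > rsq; k++) {
    float Ap[N] = {0, 0, 0};
    for (int i = 0; i < N; i++)
      for (int j = 0; j < N; j++) Ap[i] += (float)A[i][j] * p[j];
    double d = 0; for (int i = 0; i < N; i++) d += (double)p[i] * Ap[i];
    double a = cp / d, c = cp;
    for (int i = 0; i < N; i++) { x[i] += a * p[i]; r[i] -= (float)(a * Ap[i]); }
    cp = 0; for (int i = 0; i < N; i++) cp += (double)r[i] * r[i];
    if (cp > maxResid) maxResid = cp;
    if (cp < Delta * maxResid) {                        // reliable update:
      cp = 0;                                           // recompute the true residual
      for (int i = 0; i < N; i++) {                     // b - A x in double precision
        double ri = b[i];
        for (int j = 0; j < N; j++) ri -= A[i][j] * x[j];
        r[i] = (float)ri;
        cp += ri * ri;
      }
      maxResid = cp;
    }
    double beta = cp / c;
    for (int i = 0; i < N; i++) p[i] = (float)(r[i] + beta * p[i]);
    std::cout << "iter " << k << "  |r|^2 = " << cp << std::endl;
  }
}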

File diff suppressed because it is too large

View File

@@ -7,6 +7,7 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -53,16 +54,194 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* M psi = eta
***********************
*Odd
* i) (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1} eta_o
* i) D_oo psi_o = L^{-1} eta_o
* eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
*
* Wilson:
* (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1} eta_o
* Stag:
* D_oo psi_o = L^{-1} eta = (eta_o - Moe Mee^{-1} eta_e)
*
* L^{-1} eta = (  1             0 ) ( eta_e )
*              ( -Moe Mee^{-1}  1 ) ( eta_o )
*
*Even
* ii) Mee psi_e + Meo psi_o = src_e
*
* => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
*
*
* TODO: Other options:
*
* a) change checkerboards for Schur e<->o
*
* Left precon by Moo^-1
* b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 = (D_oo)^dag M_oo^-dag Moo^-1 L^{-1} eta_o
* eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
*
* Right precon by Moo^-1
* c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1} eta_o
* eta_o' = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
* psi_o = M_oo^-1 phi_o
* TODO: Deflation
*/
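/* Worked scalar example of the elimination above (illustrative only, 1x1 blocks,
 * ignoring the extra (D_oo)^dag factor of the Wilson normal-equations variant):
 * take Mee = 2, Meo = 1, Moe = 1, Moo = 3 and eta = (eta_e, eta_o) = (4, 7).
 *   eta_o' = eta_o - Moe Mee^{-1} eta_e   = 7 - (1/2)*4  = 5
 *   S      = Moo - Moe Mee^{-1} Meo       = 3 - (1/2)*1  = 5/2
 *   psi_o  = S^{-1} eta_o'                = 5 / (5/2)    = 2
 *   psi_e  = Mee^{-1} (eta_e - Meo psi_o) = (4 - 2)/2    = 1
 * and indeed M (psi_e, psi_o)^T = (2*1 + 1*2, 1*1 + 3*2)^T = (4, 7)^T = eta.
 */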
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackStaggeredSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
src_o = tmp; assert(src_o.checkerboard ==Odd);
// _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
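// Usage sketch (illustrative only; the staggered action name and its constructor
// arguments are assumptions -- any operator providing M, Meooe, Mooee and MooeeInv
// on a red-black grid will do):
//
//   typedef ImprovedStaggeredFermionR::FermionField FermionField;
//   ImprovedStaggeredFermionR Ds(Umu, Umu, *FGrid, *FrbGrid, mass);   // assumed ctor
//   ConjugateGradient<FermionField> CG(1.0e-8, 10000);
//   SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
//   SchurSolver(Ds, src, sol);   // src/sol live on the full grid; the even/odd
//                                // split, Schur solve and reassembly happen inside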
// template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
template<class Field> class SchurRedBlackStagSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackStagSolve(OperatorFunction<Field> &HermitianRBSolver, int cb) :
_HermitianRBSolver(HermitianRBSolver), CBfactorise(cb) {}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurStagOperator<Matrix,Field> _HermOpEO(_Matrix);
int Schur = CBfactorise;
int Other = 1 - CBfactorise;
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Other,src_e,in);
pickCheckerboard(Schur ,src_o,in);
pickCheckerboard(Other,sol_e,out);
pickCheckerboard(Schur ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Other);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Schur);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Schur);
#if 0
// get the right MpcDag
// _HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Schur);
#else
_Matrix.Mooee(tmp,src_o); assert(src_o.checkerboard ==Schur);
#endif
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Schur);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Other);
src_e = src_e-tmp; assert( src_e.checkerboard ==Other);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Other);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Other);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Schur );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackStag solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
@ -76,12 +255,10 @@ namespace Grid {
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
};
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0) : _HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=cb;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
@ -141,5 +318,238 @@ namespace Grid {
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoMixed {
private:
LinearFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
template<class Field> class SchurRedBlackStagMixed {
private:
LinearFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackStagMixed(LinearFunction<Field> &HermitianRBSolver, int cb) :
_HermitianRBSolver(HermitianRBSolver), CBfactorise(cb) {}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurStagOperator<Matrix,Field> _HermOpEO(_Matrix);
int Schur = CBfactorise;
int Other = 1 - CBfactorise;
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Other,src_e,in);
pickCheckerboard(Schur ,src_o,in);
pickCheckerboard(Other,sol_e,out);
pickCheckerboard(Schur ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Other);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Schur);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Schur);
_Matrix.Mooee(tmp,src_o); assert(src_o.checkerboard ==Schur);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Schur);
_HermitianRBSolver(src_o,sol_o); assert(sol_o.checkerboard==Schur);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Other);
src_e = src_e-tmp; assert( src_e.checkerboard ==Other);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Other);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Other);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Schur );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackStag solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
}
#endif
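// Relation between the solvers in this header, as read from the code above:
//
//   SchurRedBlackStaggeredSolve / SchurRedBlackStagSolve :
//       solve the staggered Schur system D_oo psi_o = L^{-1} eta_o directly;
//       the wrapped SchurStaggeredOperator / SchurStagOperator is already Hermitian.
//   SchurRedBlackDiagMooeeSolve :
//       Wilson-type normal equations, (D_oo)^dag D_oo psi_o = (D_oo)^dag L^{-1} eta_o
//       (case i) of the header comment), with the checkerboard now selectable via cb.
//   SchurRedBlackDiagTwoSolve :
//       right preconditioning by Moo^{-1} (case c) of the header comment); the
//       Hermitian solve returns phi_o = Moo psi_o, so MooeeInv is applied afterwards.
//   The *Mixed variants take a LinearFunction (e.g. a mixed precision solver)
//       instead of an OperatorFunction but follow the same even/odd algebra.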


@ -0,0 +1,933 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LANC_H
#define GRID_LANC_H
#include <string.h> //memset
#ifdef USE_LAPACK
#ifdef USE_MKL
#include<mkl_lapack.h>
#else
void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
//#include <lapacke/lapacke.h>
#endif
#endif
#include <Grid/algorithms/densematrix/DenseMatrix.h>
//#include <Grid/algorithms/iterative/EigenSort.h>
// eliminate temporary vector in calc()
#define MEM_SAVE
namespace Grid
{
struct Bisection
{
#if 0
static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
std::vector < RealD > &BETA,
std::vector < RealD > &eig)
{
int i, j;
std::vector < RealD > evec1 (row_num + 3);
std::vector < RealD > evec2 (row_num + 3);
RealD eps2;
ALPHA[1] = 0.;
BETHA[1] = 0.;
for (i = 0; i < row_num - 1; i++)
{
ALPHA[i + 1] = A[i * (row_num + 1)].real ();
BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
}
ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
// Do we really need to sort here?
int begin = 1;
int end = row_num;
int swapped = 1;
while (swapped)
{
swapped = 0;
for (i = begin; i < end; i++)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
end--;
for (i = end - 1; i >= begin; i--)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
begin++;
}
for (i = 0; i < row_num; i++)
{
for (j = 0; j < row_num; j++)
{
if (i == j)
H[i * row_num + j] = evec2[i + 1];
else
H[i * row_num + j] = 0.;
}
}
}
#endif
static void bisec (std::vector < RealD > &c,
std::vector < RealD > &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh, std::vector < RealD > &x, RealD & eps2)
{
std::vector < RealD > wu (n + 2);
RealD h, q, x1, xu, x0, xmin, xmax;
int i, a, k;
b[1] = 0.0;
xmin = c[n] - fabs (b[n]);
xmax = c[n] + fabs (b[n]);
for (i = 1; i < n; i++)
{
h = fabs (b[i]) + fabs (b[i + 1]);
if (c[i] + h > xmax)
xmax = c[i] + h;
if (c[i] - h < xmin)
xmin = c[i] - h;
}
xmax *= 2.;
eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
if (eps1 <= 0.0)
eps1 = eps2;
eps2 = 0.5 * eps1 + 7.0 * (eps2);
x0 = xmax;
for (i = m1; i <= m2; i++)
{
x[i] = xmax;
wu[i] = xmin;
}
for (k = m2; k >= m1; k--)
{
xu = xmin;
i = k;
do
{
if (xu < wu[i])
{
xu = wu[i];
i = m1 - 1;
}
i--;
}
while (i >= m1);
if (x0 > x[k])
x0 = x[k];
while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
{
x1 = (xu + x0) / 2;
a = 0;
q = 1.0;
for (i = 1; i <= n; i++)
{
q =
c[i] - x1 -
((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
if (q < 0)
a++;
}
// printf("x1=%0.14e a=%d\n",x1,a);
if (a < k)
{
if (a < m1)
{
xu = x1;
wu[m1] = x1;
}
else
{
xu = x1;
wu[a + 1] = x1;
if (x[a] > x1)
x[a] = x1;
}
}
else
x0 = x1;
}
printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
x[k] = (x0 + xu) / 2;
}
}
};
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template < class Field > class SimpleLanczos
{
const RealD small = 1.0e-16;
public:
int lock;
int get;
int Niter;
int converged;
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
int Np; // Np -- Number of spare vecs in kryloc space
int Nm; // Nm -- total number of vectors
RealD OrthoTime;
RealD eresid;
SortEigen < Field > _sort;
LinearOperatorBase < Field > &_Linop;
OperatorFunction < Field > &_poly;
/////////////////////////
// Constructor
/////////////////////////
void init (void)
{
};
void Abort (int ff, DenseVector < RealD > &evals,
DenseVector < DenseVector < RealD > >&evecs);
SimpleLanczos (LinearOperatorBase < Field > &Linop, // op
OperatorFunction < Field > &poly, // polynmial
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _Niter): // Max iterations
_Linop (Linop),
_poly (poly),
Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
{
Np = Nm - Nk;
assert (Np > 0);
};
/////////////////////////
// Sanity checked this routine (step) against Saad.
/////////////////////////
void RitzMatrix (DenseVector < Field > &evec, int k)
{
if (1)
return;
GridBase *grid = evec[0]._grid;
Field w (grid);
std::cout << GridLogMessage << "RitzMatrix " << std::endl;
for (int i = 0; i < k; i++)
{
_Linop.HermOp (evec[i], w);
// _poly(_Linop,evec[i],w);
std::cout << GridLogMessage << "[" << i << "] ";
for (int j = 0; j < k; j++)
{
ComplexD in = innerProduct (evec[j], w);
if (fabs ((double) i - j) > 1)
{
if (abs (in) > 1.0e-9)
{
std::cout << GridLogMessage << "oops" << std::endl;
abort ();
}
else
std::cout << GridLogMessage << " 0 ";
}
else
{
std::cout << GridLogMessage << " " << in << " ";
}
}
std::cout << GridLogMessage << std::endl;
}
}
void step (DenseVector < RealD > &lmd,
DenseVector < RealD > &lme,
Field & last, Field & current, Field & next, uint64_t k)
{
if (lmd.size () <= k)
lmd.resize (k + Nm);
if (lme.size () <= k)
lme.resize (k + Nm);
// _poly(_Linop,current,next ); // 3. wk := A vk - βk v_{k-1}
_Linop.HermOp (current, next); // 3. wk := A vk - βk v_{k-1}
if (k > 0)
{
next -= lme[k - 1] * last;
}
// std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
ComplexD zalph = innerProduct (current, next); // 4. αk:=(wk,vk)
RealD alph = real (zalph);
next = next - alph * current; // 5. wk := wk - αk vk
// std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
RealD beta = normalise (next); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
// norm=beta;
int interval = Nm / 100 + 1;
if ((k % interval) == 0)
std::cout << GridLogMessage << k << " : alpha = " << zalph << " beta " << beta << std::endl;
const RealD tiny = 1.0e-20;
if (beta < tiny)
{
std::cout << GridLogMessage << " beta is tiny " << beta << std::
endl;
}
lmd[k] = alph;
lme[k] = beta;
}
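// After k steps the alpha/beta pairs stored in lmd/lme define the k x k symmetric
// tridiagonal matrix
//
//   T_k = [ alpha_1  beta_2                     ]
//         [ beta_2   alpha_2  beta_3            ]
//         [          beta_3   alpha_3   ...     ]
//         [                    ...      beta_k  ]
//         [                   beta_k    alpha_k ]
//
// whose eigenvalues -- obtained below by bisection or LAPACK dstegr -- approximate
// the extremal eigenvalues of the operator.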
void qr_decomp (DenseVector < RealD > &lmd,
DenseVector < RealD > &lme,
int Nk,
int Nm,
DenseVector < RealD > &Qt, RealD Dsh, int kmin, int kmax)
{
int k = kmin - 1;
RealD x;
RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
RealD c = (lmd[k] - Dsh) * Fden;
RealD s = -lme[k] * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
// Givens transformations
for (int k = kmin; k < kmax - 1; ++k)
{
RealD Fden = 1.0 / hypot (x, lme[k - 1]);
RealD c = lme[k - 1] * Fden;
RealD s = -x * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
lme[k - 1] = c * lme[k - 1] - s * x;
if (k != kmax - 2)
{
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
}
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
}
}
#ifdef USE_LAPACK
#ifdef USE_MKL
#define LAPACK_INT MKL_INT
#else
#define LAPACK_INT long long
#endif
void diagonalize_lapack (DenseVector < RealD > &lmd, DenseVector < RealD > &lme, int N1, // all
int N2, // get
GridBase * grid)
{
const int size = Nm;
LAPACK_INT NN = N1;
double evals_tmp[NN];
double DD[NN];
double EE[NN];
for (int i = 0; i < NN; i++)
for (int j = i - 1; j <= i + 1; j++)
if (j < NN && j >= 0)
{
if (i == j)
DD[i] = lmd[i];
if (i == j)
evals_tmp[i] = lmd[i];
if (j == (i - 1))
EE[j] = lme[j];
}
LAPACK_INT evals_found;
LAPACK_INT lwork =
((18 * NN) >
(1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
LAPACK_INT liwork = 3 + NN * 10;
LAPACK_INT iwork[liwork];
double work[lwork];
LAPACK_INT isuppz[2 * NN];
char jobz = 'N'; // calculate evals only
char range = 'I'; // calculate il-th to iu-th evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
LAPACK_INT info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN / total) + 1;
double vl = 0.0, vu = 0.0;
LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
if (iu > NN)
iu = NN;
double tol = 0.0;
if (1)
{
memset (evals_tmp, 0, sizeof (double) * NN);
if (il <= NN)
{
printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
#ifdef USE_MKL
dstegr (&jobz, &range, &NN,
#else
LAPACK_dstegr (&jobz, &range, &NN,
#endif
(double *) DD, (double *) EE, &vl, &vu, &il, &iu, // these four are ignored if second parameter is 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double *) NULL, &NN,
isuppz, work, &lwork, iwork, &liwork, &info);
for (int i = iu - 1; i >= il - 1; i--)
{
printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
evals_tmp[i] = evals_tmp[i - (il - 1)];
if (il > 1)
evals_tmp[i - (il - 1)] = 0.;
}
}
{
grid->GlobalSumVector (evals_tmp, NN);
}
}
// Cheating a bit: it would be better to sort instead of just reversing, but the documentation of the routine says the evals are returned in increasing order, whereas qr gives evals in decreasing order.
}
#undef LAPACK_INT
#endif
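// In diagonalize_lapack above each MPI rank asks dstegr for its own slice [il,iu]
// of the spectrum of the tridiagonal matrix (jobz='N', range='I'), writes the
// results into the corresponding entries of evals_tmp, and GlobalSumVector then
// assembles the complete eigenvalue list on every rank.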
void diagonalize (DenseVector < RealD > &lmd,
DenseVector < RealD > &lme,
int N2, int N1, GridBase * grid)
{
#ifdef USE_LAPACK
const int check_lapack = 0; // just use lapack if 0, check against lapack if 1
if (!check_lapack)
return diagonalize_lapack (lmd, lme, N2, N1, grid);
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
}
#if 1
static RealD normalise (Field & v)
{
RealD nn = norm2 (v);
nn = sqrt (nn);
v = v * (1.0 / nn);
return nn;
}
void orthogonalize (Field & w, DenseVector < Field > &evec, int k)
{
double t0 = -usecond () / 1e6;
typedef typename Field::scalar_type MyComplex;
MyComplex ip;
if (0)
{
for (int j = 0; j < k; ++j)
{
normalise (evec[j]);
for (int i = 0; i < j; i++)
{
ip = innerProduct (evec[i], evec[j]); // are the evecs normalised? ; this assumes so.
evec[j] = evec[j] - ip * evec[i];
}
}
}
for (int j = 0; j < k; ++j)
{
ip = innerProduct (evec[j], w); // are the evecs normalised? ; this assumes so.
w = w - ip * evec[j];
}
normalise (w);
t0 += usecond () / 1e6;
OrthoTime += t0;
}
void setUnit_Qt (int Nm, DenseVector < RealD > &Qt)
{
for (int i = 0; i < Qt.size (); ++i)
Qt[i] = 0.0;
for (int k = 0; k < Nm; ++k)
Qt[k + k * Nm] = 1.0;
}
void calc (DenseVector < RealD > &eval, const Field & src, int &Nconv)
{
GridBase *grid = src._grid;
// assert(grid == src._grid);
std::cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::endl;
std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
std::cout << GridLogMessage << " -- size of eval = " << eval.size () << std::endl;
// assert(c.size() && Nm == eval.size());
DenseVector < RealD > lme (Nm);
DenseVector < RealD > lmd (Nm);
Field current (grid);
Field last (grid);
Field next (grid);
Nconv = 0;
RealD beta_k;
// Set initial vector
// (uniform vector) Why not src??
// evec[0] = 1.0;
current = src;
std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
endl;
normalise (current);
std::
cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
std::endl;
// Initial Nk steps
OrthoTime = 0.;
double t0 = usecond () / 1e6;
RealD norm; // sqrt norm of last vector
uint64_t iter = 0;
bool initted = false;
std::vector < RealD > low (Nstop * 10);
std::vector < RealD > high (Nstop * 10);
RealD cont = 0.;
while (1) {
cont = 0.;
std::vector < RealD > lme2 (Nm);
std::vector < RealD > lmd2 (Nm);
for (uint64_t k = 0; k < Nm; ++k, iter++) {
step (lmd, lme, last, current, next, iter);
last = current;
current = next;
}
double t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
std::
cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
OrthoTime << "seconds" << std::endl;
// getting eigenvalues
lmd2.resize (iter + 2);
lme2.resize (iter + 2);
for (uint64_t k = 0; k < iter; ++k) {
lmd2[k + 1] = lmd[k];
lme2[k + 2] = lme[k];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: copy: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
{
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (Nstop / total) + 1;
int iu = (iter + 1) - (interval * node + 1);
int il = (iter + 1) - (interval * (node + 1));
std::vector < RealD > eval2 (iter + 3);
RealD eps2;
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
// diagonalize(eval2,lme2,iter,Nk,grid);
RealD diff = 0.;
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
fabs (high[iu-i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
high[iu-i], diff);
high[iu-i] = eval2[i];
}
il = (interval * node + 1);
iu = (interval * (node + 1));
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
fabs (low[i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
low[i], diff);
low[i] = eval2[i];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
}
for (uint64_t k = 0; k < Nk; ++k) {
// eval[k] = eval2[k];
}
if (initted)
{
grid->GlobalSumVector (&cont, 1);
if (cont < 1.) return;
}
initted = true;
}
}
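// Convergence test used in calc() above: after every sweep of Nm Lanczos steps the
// current tridiagonal matrix is re-diagonalized (bisection on a per-node slice of
// the spectrum, once from the high end and once from the low end) and each value is
// compared with the previous sweep; only if every relative change
// |eval_new - eval_old| / (|eval_new| + |eval_old|) is below eresid on all nodes
// (cont stays zero after the GlobalSumVector) does the routine return.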
/**
There is some matrix Q such that for any vector y
Q.e_1 = y and Q is unitary.
**/
template < class T >
static T orthQ (DenseMatrix < T > &Q, DenseVector < T > y)
{
int N = y.size (); //Matrix Size
Fill (Q, 0.0);
T tau;
for (int i = 0; i < N; i++)
{
Q[i][0] = y[i];
}
T sig = conj (y[0]) * y[0];
T tau0 = fabs (sqrt (sig));
for (int j = 1; j < N; j++)
{
sig += conj (y[j]) * y[j];
tau = abs (sqrt (sig));
if (abs (tau0) > 0.0)
{
T gam = conj ((y[j] / tau) / tau0);
for (int k = 0; k <= j - 1; k++)
{
Q[k][j] = -gam * y[k];
}
Q[j][j] = tau0 / tau;
}
else
{
Q[j - 1][j] = 1.0;
}
tau0 = tau;
}
return tau;
}
/**
There is some matrix Q such that for any vector y
Q.e_k = y and Q is unitary.
**/
template < class T >
static T orthU (DenseMatrix < T > &Q, DenseVector < T > y)
{
T tau = orthQ (Q, y);
SL (Q);
return tau;
}
/**
Wind up with a matrix with the first con rows untouched
say con = 2
Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st column
and the matrix is upper Hessenberg
and with f and Q appropriately modified with Q is the Arnoldi factorization
**/
template < class T > static void Lock (DenseMatrix < T > &H, ///Hess mtx
DenseMatrix < T > &Q, ///Lock Transform
T val, ///value to be locked
int con, ///number already locked
RealD small, int dfg, bool herm)
{
//ForceTridiagonal(H);
int M = H.dim;
DenseVector < T > vec;
Resize (vec, M - con);
DenseMatrix < T > AH;
Resize (AH, M - con, M - con);
AH = GetSubMtx (H, con, M, con, M);
DenseMatrix < T > QQ;
Resize (QQ, M - con, M - con);
Unity (Q);
Unity (QQ);
DenseVector < T > evals;
Resize (evals, M - con);
DenseMatrix < T > evecs;
Resize (evecs, M - con, M - con);
Wilkinson < T > (AH, evals, evecs, small);
int k = 0;
RealD cold = abs (val - evals[k]);
for (int i = 1; i < M - con; i++)
{
RealD cnew = abs (val - evals[i]);
if (cnew < cold)
{
k = i;
cold = cnew;
}
}
vec = evecs[k];
ComplexD tau;
orthQ (QQ, vec);
//orthQM(QQ,AH,vec);
AH = Hermitian (QQ) * AH;
AH = AH * QQ;
for (int i = con; i < M; i++)
{
for (int j = con; j < M; j++)
{
Q[i][j] = QQ[i - con][j - con];
H[i][j] = AH[i - con][j - con];
}
}
for (int j = M - 1; j > con + 2; j--)
{
DenseMatrix < T > U;
Resize (U, j - 1 - con, j - 1 - con);
DenseVector < T > z;
Resize (z, j - 1 - con);
T nm = norm (z);
for (int k = con + 0; k < j - 1; k++)
{
z[k - con] = conj (H (j, k + 1));
}
normalise (z);
RealD tmp = 0;
for (int i = 0; i < z.size () - 1; i++)
{
tmp = tmp + abs (z[i]);
}
if (tmp < small / ((RealD) z.size () - 1.0))
{
continue;
}
tau = orthU (U, z);
DenseMatrix < T > Hb;
Resize (Hb, j - 1 - con, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += H[a][con + 1 + c] * U[c][b];
} //sum += H(a,con+1+c)*U(c,b);}
Hb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
H[l][k] = Hb[k - 1 - con][l];
}
} //H(Hb[k-1-con][l] , l,k);}}
DenseMatrix < T > Qb;
Resize (Qb, M, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += Q[a][con + 1 + c] * U[c][b];
} //sum += Q(a,con+1+c)*U(c,b);}
Qb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
Q[l][k] = Qb[k - 1 - con][l];
}
} //Q(Qb[k-1-con][l] , l,k);}}
DenseMatrix < T > Hc;
Resize (Hc, M, M);
for (int a = 0; a < j - 1 - con; a++)
{
for (int b = 0; b < M; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += conj (U[c][a]) * H[con + 1 + c][b];
} //sum += conj( U(c,a) )*H(con+1+c,b);}
Hc[b][a] = sum;
}
}
for (int k = 0; k < M; k++)
{
for (int l = con + 1; l < j; l++)
{
H[l][k] = Hc[k][l - 1 - con];
}
} //H(Hc[k][l-1-con] , l,k);}}
}
}
#endif
};
}
#endif


@ -0,0 +1,122 @@
#include <math.h>
#include <stdlib.h>
#include <vector>
struct Bisection {
static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig)
{
int i,j;
std::vector<RealD> evec1(row_num+3);
std::vector<RealD> evec2(row_num+3);
RealD eps2;
// NB: A (the row_num x row_num complex tridiagonal input) and H (the output
// matrix filled below) are assumed to be supplied by the enclosing scope.
ALPHA[1]=0.;
BETA[1]=0.;
for(i=0;i<row_num-1;i++) {
ALPHA[i+1] = A[i*(row_num+1)].real();
BETA[i+2] = A[i*(row_num+1)+1].real();
}
ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real();
bisec(ALPHA,BETA,row_num,1,row_num,1e-10,1e-10,evec1,eps2);
bisec(ALPHA,BETA,row_num,1,row_num,1e-16,1e-16,evec2,eps2);
// Do we really need to sort here?
int begin=1;
int end = row_num;
int swapped=1;
while(swapped) {
swapped=0;
for(i=begin;i<end;i++){
if(mag(evec2[i])>mag(evec2[i+1])) {
swap(evec2+i,evec2+i+1);
swapped=1;
}
}
end--;
for(i=end-1;i>=begin;i--){
if(mag(evec2[i])>mag(evec2[i+1])) {
swap(evec2+i,evec2+i+1);
swapped=1;
}
}
begin++;
}
for(i=0;i<row_num;i++){
for(j=0;j<row_num;j++) {
if(i==j) H[i*row_num+j]=evec2[i+1];
else H[i*row_num+j]=0.;
}
}
}
static void bisec(std::vector<RealD> &c,
std::vector<RealD> &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh,
std::vector<RealD> &x,
RealD &eps2)
{
std::vector<RealD> wu(n+2);
RealD h,q,x1,xu,x0,xmin,xmax;
int i,a,k;
b[1]=0.0;
xmin=c[n]-fabs(b[n]);
xmax=c[n]+fabs(b[n]);
for(i=1;i<n;i++){
h=fabs(b[i])+fabs(b[i+1]);
if(c[i]+h>xmax) xmax= c[i]+h;
if(c[i]-h<xmin) xmin= c[i]-h;
}
xmax *=2.;
eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin);
if(eps1<=0.0) eps1=eps2;
eps2=0.5*eps1+7.0*(eps2);
x0=xmax;
for(i=m1;i<=m2;i++){
x[i]=xmax;
wu[i]=xmin;
}
for(k=m2;k>=m1;k--){
xu=xmin;
i=k;
do{
if(xu<wu[i]){
xu=wu[i];
i=m1-1;
}
i--;
}while(i>=m1);
if(x0>x[k]) x0=x[k];
while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){
x1=(xu+x0)/2;
a=0;
q=1.0;
for(i=1;i<=n;i++){
q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh);
if(q<0) a++;
}
// printf("x1=%e a=%d\n",x1,a);
if(a<k){
if(a<m1){
xu=x1;
wu[m1]=x1;
}else {
xu=x1;
wu[a+1]=x1;
if(x[a]>x1) x[a]=x1;
}
}else x0=x1;
}
x[k]=(x0+xu)/2;
}
}
};
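// Usage sketch (illustrative; RealD is assumed to be already defined by the
// including translation unit -- in Grid it is a typedef for double). The routine
// is 1-indexed: c[1..n] holds the diagonal, b[2..n] the off-diagonal, and x[m1..m2]
// receives the m1-th through m2-th smallest eigenvalues.
//
//   int n = 2;
//   std::vector<RealD> c(n+2), b(n+2), x(n+2);
//   RealD eps2;
//   c[1] = 2.0; c[2] = 2.0;      // the 2x2 matrix [[2,1],[1,2]] has eigenvalues 1 and 3
//   b[2] = 1.0;
//   Bisection::bisec(c, b, n, 1, n, 1e-14, 1e-14, x, eps2);
//   // x[1] -> 1.0, x[2] -> 3.0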


@ -1,7 +1,5 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
namespace Grid {
@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
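// Background for the check above: /proc/self/pagemap holds one 64-bit entry per
// virtual 4k page, with the physical frame number in bits 0-54 (hence the
// 0x7fffffffffffffULL mask). A buffer is backed by 2MB huge pages exactly when each
// block of 512 consecutive virtual pages maps onto 512 physically contiguous frames,
// which is what the inner loop counts into nnothuge.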
}


@ -64,6 +64,8 @@ namespace Grid {
};
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////


@ -49,6 +49,10 @@ public:
template<class object> friend class Lattice;
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent) : CartesianCommunicator(processor_grid,parent) {};
virtual ~GridBase() = default;
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
@ -210,9 +214,6 @@ public:
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;
int mult=1;


@ -61,9 +61,31 @@ public:
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent)
{
Init(dimensions,simd_layout,processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
{
Init(dimensions,simd_layout,processor_grid);
}
virtual ~GridCartesian() = default;
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid)
{
///////////////////////
// Grid information


@ -112,24 +112,59 @@ public:
}
};
GridRedBlackCartesian(const GridBase *base) : GridRedBlackCartesian(base->_fdimensions,base->_simd_layout,base->_processors) {};
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{
int dims = base->_ndimension;
std::vector<int> checker_dim_mask(dims,1);
int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
};
GridRedBlackCartesian(const std::vector<int> &dimensions,
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(base->_processors,*base)
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
}
virtual ~GridRedBlackCartesian() = default;
#if 0
////////////////////////////////////////////////////////////
// Create redblack grid ;; deprecate these. Should not
// need direct creation of redblack without a full grid to base on
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(processor_grid)
) : GridBase(processor_grid,*base)
{
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
GridRedBlackCartesian(const std::vector<int> &dimensions,
////////////////////////////////////////////////////////////
// Create redblack grid
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
const std::vector<int> &processor_grid) : GridBase(processor_grid,*base)
{
std::vector<int> checker_dim_mask(dimensions.size(),1);
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
int checker_dim = 0;
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
#endif
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,


@ -67,7 +67,7 @@ void CartesianCommunicator::ShmBufferFreeAll(void) {
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int CartesianCommunicator::Dimensions(void) { return _ndimension; };
int CartesianCommunicator::Dimensions(void) { return _ndimension; };
int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; };
@ -96,6 +96,124 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N);
}
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
{
_ndimension = processors.size();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // number of child grids (splits) in each dimension of the parent
std::vector<int> pcoor(_ndimension,0);
std::vector<int> pdims(_ndimension,1);
if(parent._processors.size()==4 && _ndimension==5){
for(int i=0;i<4;i++) pcoor[i+1]=parent._processor_coor[i];
for(int i=0;i<4;i++) pdims[i+1]=parent._processors[i];
} else {
assert(_ndimension == parent._ndimension);
for(int i=0;i<_ndimension;i++) pcoor[i]=parent._processor_coor[i];
for(int i=0;i<_ndimension;i++) pdims[i]=parent._processors[i];
}
for(int d=0;d<_ndimension;d++){
ccoor[d] = pcoor[d] % processors[d];
scoor[d] = pcoor[d] / processors[d];
ssize[d] = pdims[d] / processors[d];
}
int crank,srank; // rank within subcomm ; rank of subcomm within blocks of subcomms
Lexicographic::IndexFromCoor(ccoor,crank,processors);
Lexicographic::IndexFromCoor(scoor,srank,ssize);
MPI_Comm comm_split;
if ( Nchild > 1 ) {
// std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
// std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
// for(int d=0;d<parent._processors.size();d++) std::cout << parent._processors[d] << " ";
// std::cout<<std::endl;
// std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
// for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
// std::cout<<std::endl;
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
// std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
// << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
} else {
comm_split=parent.communicator;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
}
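// Worked example of the split above (numbers are illustrative only): with a parent
// processor grid {4,4,2,2} (Nparent=64) and a requested child grid {2,2,2,2}
// (childsize=16) there are Nchild=4 sub-communicators and ssize={2,2,1,1}.
// A rank at parent coordinate pcoor={3,1,0,1} gets
//   ccoor = pcoor % processors = {1,1,0,1}   (its coordinate inside the child grid)
//   scoor = pcoor / processors = {1,0,0,0}   (which of the 4 child grids it joins)
// and MPI_Comm_split is called with colour srank (lexicographic index of scoor in
// ssize) and key crank (lexicographic index of ccoor in processors).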
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take an MPI_Comm and self assemble
//////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
// if ( communicator_base != communicator_world ) {
// std::cout << "Cartesian communicator created with a non-world communicator"<<std::endl;
// }
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
int Size;
MPI_Comm_size(communicator,&Size);
#ifdef GRID_COMMS_MPIT
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
#endif
assert(Size==_Nprocessors);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
InitFromMPICommunicator(processors,communicator_world);
}
#endif
#if !defined( GRID_COMMS_MPI3)
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
@ -147,8 +265,13 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
}
void CartesianCommunicator::ShmInitGeneric(void){
#if 1
int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
@ -157,7 +280,9 @@ void CartesianCommunicator::ShmInitGeneric(void){
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
#endif
#else
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0];


@ -83,6 +83,7 @@ class CartesianCommunicator {
std::vector<MPI_Comm> communicator_halo;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
#endif
@ -147,11 +148,24 @@ class CartesianCommunicator {
// Must call in Grid startup
////////////////////////////////////////////////
static void Init(int *argc, char ***argv);
////////////////////////////////////////////////
// Constructor of any given grid
// Constructors to sub-divide a parent communicator
// and default to comm world
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent);
CartesianCommunicator(const std::vector<int> &pdimensions_in);
virtual ~CartesianCommunicator();
private:
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
////////////////////////////////////////////////
// Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private
////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);
#endif
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
@ -249,6 +263,27 @@ class CartesianCommunicator {
// Broadcast a buffer and composite larger
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// All2All down one dimension
////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0);
assert(dim<_ndimension);
int numnode = _processors[dim];
// std::cerr << " AllToAll in.size() "<<in.size()<<std::endl;
// std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;
assert(in.size()==out.size());
uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode;
assert(numnode * words == in.size());
assert(words < (1ULL<<32));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
}
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
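// The vector AllToAll above redistributes data along one processor dimension:
// in.size() must be a multiple of _processors[dim] (asserted via words), and each
// rank ends up holding one slice from every rank in that dimension. In the MPI
// implementation this is done by splitting off a one-dimensional sub-communicator
// for dim and calling MPI_Alltoall on it.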
template<class obj> void Broadcast(int root,obj &data)
{


@ -53,28 +53,14 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::~CartesianCommunicator()
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size;
MPI_Comm_size(communicator,&Size);
assert(Size==_Nprocessors);
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && MPI_is_finalised)
MPI_Comm_free(&communicator);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -210,6 +196,35 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
root,
communicator);
assert(ierr==0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
CartesianCommunicator Comm(row,*this);
Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" count arguments.
// 64*64*64*128*16 == ~500 million elements of data;
// at 24*4 byte multiples that is ~5x10^10 bytes >> 2x10^9, overflowing a 32-bit int.
// (Turns up on 32^3 x 64 Gparity too.)
// Packing each element's bytes into a contiguous MPI datatype keeps the counts small.
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
@ -230,5 +245,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
assert(ierr==0);
}
}


@ -215,8 +215,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED |MAP_POPULATE;
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
@ -249,7 +251,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
int mmap_flag = MAP_SHARED|MAP_POPULATE;
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (Hugepages) mmap_flag |= MAP_HUGETLB;
#endif
@ -445,6 +450,15 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
assert(lr!=-1);
Lexicographic::CoorFromIndex(coor,lr,_processors);
}
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
: CartesianCommunicator(processors)
{
std::cout << "Attempts to split MPI3 communicators will fail until implemented" <<std::endl;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
int ierr;
@ -698,7 +712,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int from,
int bytes,int dir)
{
assert(dir < communicator_halo.size());
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
@ -718,14 +733,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
gfrom = MPI_UNDEFINED;
#endif
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;


@ -53,33 +53,13 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::~CartesianCommunicator()
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
int Size;
MPI_Comm_size(communicator,&Size);
assert(Size==_Nprocessors);
if (communicator && !MPI::Is_finalized())
MPI_Comm_free(&communicator);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -244,13 +224,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
{
int myrank = _processor;
int ierr;
assert(dir < communicator_halo.size());
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
list.push_back(req[0]);
list.push_back(req[1]);
@ -269,13 +250,14 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
{
int myrank = _processor;
int ierr;
assert(dir < communicator_halo.size());
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo.size()<< <std::endl;
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
return 2.0*bytes;
}
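As a quick illustration of the dir -> commdir fold used above (the constructor earlier in this diff duplicates 2*ndim halo communicators, so any stencil direction index has to be mapped back into that range), a standalone toy; the numbers are hypothetical:
// Illustration only, not Grid code.
int ncomm   = 8;              // stands in for communicator_halo.size() on a 4-d grid
int dir     = 11;             // hypothetical out-of-range direction index
int commdir = dir % ncomm;    // 11 % 8 = 3, always a valid communicator slot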

View File

@ -38,6 +38,9 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
: CartesianCommunicator(processors) {}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_processors = processors;
@ -53,6 +56,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
}
}
CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
@ -95,6 +100,14 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
}
int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}

View File

@ -75,6 +75,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
: CartesianCommunicator(processors)
{
std::cout << "Attempts to split SHMEM communicators will fail " <<std::endl;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();

File diff suppressed because it is too large

View File

@ -544,7 +544,6 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
// vector_typeD rtmp = TensorRemove(tmp);
auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp);
}}

View File

@ -684,6 +684,307 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
merge(out._odata[out_oidx], ptrs, 0);
}
}
////////////////////////////////////////////////////////////////////////////////
// Communicate between grids
////////////////////////////////////////////////////////////////////////////////
//
// All to all plan
//
// Subvolume on fine grid is v. Vectors a,b,c,d
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMPLEST CASE:
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Mesh of nodes (2); subdivide into 1 subdivision
//
// Lex ord:
// N0 va0 vb0 N1 va1 vb1
//
// For each dimension do an all to all
//
// full AllToAll(0)
// N0 va0 va1 N1 vb0 vb1
//
// REARRANGE
// N0 va01 N1 vb01
//
// Must also rearrange data to get it into the NEW lex order of the grid at each stage. Some kind of "insert/extract".
// NB: easiest to program if we keep the data in lex order.
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMPLE CASE:
///////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Mesh of nodes (2x2); subdivide into 1x1 subdivisions
//
// Lex ord:
// N0 va0 vb0 vc0 vd0 N1 va1 vb1 vc1 vd1
// N2 va2 vb2 vc2 vd2 N3 va3 vb3 vc3 vd3
//
// Ratio = full[dim] / split[dim]
//
// For each dimension do an all to all; get Nvec -> Nvec / ratio
// Ldim -> Ldim * ratio
// LocalVol -> LocalVol * ratio
// full AllToAll(0)
// N0 va0 vb0 va1 vb1 N1 vc0 vd0 vc1 vd1
// N2 va2 vb2 va3 vb3 N3 vc2 vd2 vc3 vd3
//
// REARRANGE
// N0 va01 vb01 N1 vc01 vd01
// N2 va23 vb23 N3 vc23 vd23
//
// full AllToAll(1) // Not what is wanted. FIXME
// N0 va01 va23 N1 vc01 vc23
// N2 vb01 vb23 N3 vd01 vd23
//
// REARRANGE
// N0 va0123 N1 vc0123
// N2 vb0123 N3 vd0123
//
// Must also rearrange data to get it into the NEW lex order of the grid at each stage. Some kind of "insert/extract".
// NB: easiest to program if we keep the data in lex order.
//
/////////////////////////////////////////////////////////
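A standalone toy of the rearrangement step sketched above (illustration only, plain C++ with no Grid types): one dimension of the 2x2 SIMPLE CASE after AllToAll(0), collapsing the other dimensions into the site count. A rank holding tmp = [ a0 b0 a1 b1 ] is reshuffled into all = [ a0 a1 | b0 b1 ], i.e. vector a on the doubled local extent followed by vector b, matching the REARRANGE step:
#include <vector>
#include <cassert>
int main() {
  const int L = 2, ratio = 2, nvec = 2;   // local extent, processor ratio, vectors left after AllToAll
  std::vector<int> tmp = {10,11, 20,21, 12,13, 22,23};  // a0 b0 a1 b1
  std::vector<int> all(tmp.size());
  for (int v = 0; v < nvec; v++)
    for (int lsite = 0; lsite < L; lsite++)
      for (int r = 0; r < ratio; r++)
        all[(lsite + r*L) + v*(ratio*L)] = tmp[lsite + r*(nvec*L) + v*L];
  std::vector<int> expect = {10,11,12,13, 20,21,22,23};  // a01 then b01
  assert(all == expect);
  return 0;
}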
template<class Vobj>
void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
{
typedef typename Vobj::scalar_object Sobj;
int full_vecs = full.size();
assert(full_vecs>=1);
GridBase * full_grid = full[0]._grid;
GridBase *split_grid = split._grid;
int ndim = full_grid->_ndimension;
int full_nproc = full_grid->_Nprocessors;
int split_nproc =split_grid->_Nprocessors;
////////////////////////////////
// Checkerboard management
////////////////////////////////
int cb = full[0].checkerboard;
split.checkerboard = cb;
//////////////////////////////
// Checks
//////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){
assert(full[n].checkerboard == cb);
for(int d=0;d<ndim;d++){
assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]);
assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]);
}
}
int nvector =full_nproc/split_nproc;
assert(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs);
std::vector<int> ratio(ndim);
for(int d=0;d<ndim;d++){
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
}
uint64_t lsites = full_grid->lSites();
uint64_t sz = lsites * nvector;
std::vector<Sobj> tmpdata(sz);
std::vector<Sobj> alldata(sz);
std::vector<Sobj> scalardata(lsites);
for(int v=0;v<nvector;v++){
unvectorizeToLexOrdArray(scalardata,full[v]);
parallel_for(int site=0;site<lsites;site++){
alldata[v*lsites+site] = scalardata[site];
}
}
int nvec = nvector; // Counts down to 1 as we collapse dims
std::vector<int> ldims = full_grid->_ldimensions;
std::vector<int> lcoor(ndim);
for(int d=0;d<ndim;d++){
if ( ratio[d] != 1 ) {
full_grid ->AllToAll(d,alldata,tmpdata);
//////////////////////////////////////////
//Local volume for this dimension is expanded by ratio of processor extents
// Number of vectors is decreased by same factor
// Rearrange to lexico for bigger volume
//////////////////////////////////////////
nvec /= ratio[d];
auto rdims = ldims; rdims[d] *= ratio[d];
auto rsites= lsites*ratio[d];
for(int v=0;v<nvec;v++){
// For loop over each site within old subvol
for(int lsite=0;lsite<lsites;lsite++){
Lexicographic::CoorFromIndex(lcoor, lsite, ldims);
for(int r=0;r<ratio[d];r++){ // ratio*nvec terms
auto rcoor = lcoor; rcoor[d] += r*ldims[d];
int rsite; Lexicographic::IndexFromCoor(rcoor, rsite, rdims);
rsite += v * rsites;
int rmul=nvec*lsites;
int vmul= lsites;
alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul];
}
}
}
ldims[d]*= ratio[d];
lsites *= ratio[d];
if ( split_grid->_processors[d] > 1 ) {
tmpdata = alldata;
split_grid->AllToAll(d,tmpdata,alldata);
}
}
}
vectorizeFromLexOrdArray(alldata,split);
}
template<class Vobj>
void Grid_split(Lattice<Vobj> &full,Lattice<Vobj> & split)
{
int nvector = full._grid->_Nprocessors / split._grid->_Nprocessors;
std::vector<Lattice<Vobj> > full_v(nvector,full._grid);
for(int n=0;n<nvector;n++){
full_v[n] = full;
}
Grid_split(full_v,split);
}
template<class Vobj>
void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
{
typedef typename Vobj::scalar_object Sobj;
int full_vecs = full.size();
assert(full_vecs>=1);
GridBase * full_grid = full[0]._grid;
GridBase *split_grid = split._grid;
int ndim = full_grid->_ndimension;
int full_nproc = full_grid->_Nprocessors;
int split_nproc =split_grid->_Nprocessors;
////////////////////////////////
// Checkerboard management
////////////////////////////////
int cb = full[0].checkerboard;
split.checkerboard = cb;
//////////////////////////////
// Checks
//////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){
assert(full[n].checkerboard == cb);
for(int d=0;d<ndim;d++){
assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]);
assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]);
}
}
int nvector =full_nproc/split_nproc;
assert(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs);
std::vector<int> ratio(ndim);
for(int d=0;d<ndim;d++){
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
}
uint64_t lsites = full_grid->lSites();
uint64_t sz = lsites * nvector;
std::vector<Sobj> tmpdata(sz);
std::vector<Sobj> alldata(sz);
std::vector<Sobj> scalardata(lsites);
unvectorizeToLexOrdArray(alldata,split);
/////////////////////////////////////////////////////////////////
// Start from split grid and work towards full grid
/////////////////////////////////////////////////////////////////
std::vector<int> lcoor(ndim);
std::vector<int> rcoor(ndim);
int nvec = 1;
lsites = split_grid->lSites();
std::vector<int> ldims = split_grid->_ldimensions;
for(int d=ndim-1;d>=0;d--){
if ( ratio[d] != 1 ) {
if ( split_grid->_processors[d] > 1 ) {
tmpdata = alldata;
split_grid->AllToAll(d,tmpdata,alldata);
}
//////////////////////////////////////////
//Local volume for this dimension is expanded by ratio of processor extents
// Number of vectors is decreased by same factor
// Rearrange to lexico for bigger volume
//////////////////////////////////////////
auto rsites= lsites/ratio[d];
auto rdims = ldims; rdims[d]/=ratio[d];
for(int v=0;v<nvec;v++){
// rsite, rcoor --> smaller local volume
// lsite, lcoor --> bigger original (single node?) volume
// For loop over each site within smaller subvol
for(int rsite=0;rsite<rsites;rsite++){
Lexicographic::CoorFromIndex(rcoor, rsite, rdims);
int lsite;
for(int r=0;r<ratio[d];r++){
lcoor = rcoor; lcoor[d] += r*rdims[d];
Lexicographic::IndexFromCoor(lcoor, lsite, ldims); lsite += v * lsites;
int rmul=nvec*rsites;
int vmul= rsites;
tmpdata[rsite+r*rmul+v*vmul]=alldata[lsite];
}
}
}
nvec *= ratio[d];
ldims[d]=rdims[d];
lsites =rsites;
full_grid ->AllToAll(d,tmpdata,alldata);
}
}
lsites = full_grid->lSites();
for(int v=0;v<nvector;v++){
parallel_for(int site=0;site<lsites;site++){
scalardata[site] = alldata[v*lsites+site];
}
assert(v<full.size());
vectorizeFromLexOrdArray(scalardata,full[v]);
}
}
}
#endif
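A minimal usage sketch for the split/unsplit pair defined above, modeled on the multi-RHS solver tests. The grid names, lattice and MPI layouts, the split-grid constructor taking a parent grid, and the LatticeFermionD field type are assumptions for illustration, not part of this changeset:
// Hypothetical boilerplate: 8 ranks in the full grid, 4 per split universe,
// so nrhs = 2 independent right-hand sides can be scattered and gathered back.
std::vector<int> latt     ({16,16,16,16});
std::vector<int> mpi_full ({1,2,2,2});      // 8 ranks total
std::vector<int> mpi_split({1,2,2,1});      // 4 ranks per split grid
GridCartesian *FGrid = new GridCartesian(latt, GridDefaultSimd(Nd,vComplexD::Nsimd()), mpi_full);
GridCartesian *SGrid = new GridCartesian(latt, GridDefaultSimd(Nd,vComplexD::Nsimd()), mpi_split, *FGrid);
int nrhs = FGrid->_Nprocessors / SGrid->_Nprocessors;   // = 2 here
std::vector<LatticeFermionD> src(nrhs, FGrid);
LatticeFermionD src_split(SGrid);
Grid_split  (src, src_split);   // full-grid vectors -> one split-grid field
Grid_unsplit(src, src_split);   // and back again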

View File

@ -84,10 +84,6 @@ namespace QCD {
stream << "GRID_";
stream << ScidacWordMnemonic<stype>();
// std::cout << " Lorentz N/S/V/M : " << _LorentzN<<" "<<_LorentzScalar<<"/"<<_LorentzVector<<"/"<<_LorentzMatrix<<std::endl;
// std::cout << " Spin N/S/V/M : " << _SpinN <<" "<<_SpinScalar <<"/"<<_SpinVector <<"/"<<_SpinMatrix<<std::endl;
// std::cout << " Colour N/S/V/M : " << _ColourN <<" "<<_ColourScalar <<"/"<<_ColourVector <<"/"<<_ColourMatrix<<std::endl;
if ( _LorentzVector ) stream << "_LorentzVector"<<_LorentzN;
if ( _LorentzMatrix ) stream << "_LorentzMatrix"<<_LorentzN;
@ -182,7 +178,7 @@ class GridLimeReader : public BinaryIO {
/////////////////////////////////////////////
// Open the file
/////////////////////////////////////////////
void open(std::string &_filename)
void open(const std::string &_filename)
{
filename= _filename;
File = fopen(filename.c_str(), "r");
@ -210,19 +206,33 @@ class GridLimeReader : public BinaryIO {
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
std::cout << GridLogMessage << limeReaderType(LimeR) <<std::endl;
if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
uint64_t file_bytes =limeReaderBytes(LimeR);
// std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl;
assert(PayloadSize == file_bytes);// Must match or user error
off_t offset= ftell(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< sobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
/////////////////////////////////////////////
// Insist checksum is next record
/////////////////////////////////////////////
readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name);
readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
/////////////////////////////////////////////
// Verify checksums
@ -242,11 +252,19 @@ class GridLimeReader : public BinaryIO {
// should this be a do while; can we miss a first record??
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
// std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
XmlReader RD(&xmlc[0],"");
read(RD,object_name,object);
return;
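The negation added in the two hunks above is the substantive fix here: strncmp returns 0 on a match, so the un-negated test fired for every record type except the one being sought. A trivial standalone reminder of that semantics:
#include <cstring>
#include <cassert>
int main() {
  // strncmp returns 0 on a match, hence the '!' added in the record tests above.
  assert(  strncmp("scidacChecksum", "scidacChecksum", 14) == 0 );
  assert( !strncmp("scidacChecksum", "scidacChecksum", 14) );
  return 0;
}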
@ -261,13 +279,14 @@ class GridLimeWriter : public BinaryIO {
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
// FIXME: collective calls or not ?
// : must know if I am the I/O boss
///////////////////////////////////////////////////
FILE *File;
LimeWriter *LimeW;
std::string filename;
void open(std::string &_filename) {
void open(const std::string &_filename) {
filename= _filename;
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
@ -302,14 +321,18 @@ class GridLimeWriter : public BinaryIO {
write(WR,object_name,object);
xmlstring = WR.XmlString();
}
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); assert(h!= NULL);
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
// std::cout << " File offset is now"<<ftell(File) << std::endl;
}
////////////////////////////////////////////
// Write a generic lattice field and csum
@ -326,6 +349,11 @@ class GridLimeWriter : public BinaryIO {
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
// the same file through different file handles (integer units).
@ -340,6 +368,7 @@ class GridLimeWriter : public BinaryIO {
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
off_t offset = ftell(File);
// std::cout << " Writing to offset "<<offset << std::endl;
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
@ -354,7 +383,7 @@ class GridLimeWriter : public BinaryIO {
checksum.suma= streama.str();
checksum.sumb= streamb.str();
std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
writeLimeObject(0,1,checksum,std::string("scidacChecksum" ),std::string(SCIDAC_CHECKSUM));
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
}
};
@ -371,11 +400,9 @@ class ScidacWriter : public GridLimeWriter {
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord)
{
typedef typename vobj::scalar_object sobj;
uint64_t nbytes;
GridBase * grid = field._grid;
////////////////////////////////////////
@ -397,6 +424,66 @@ class ScidacWriter : public GridLimeWriter {
}
};
class ScidacReader : public GridLimeReader {
public:
template<class SerialisableUserFile>
void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{
typedef typename vobj::scalar_object sobj;
GridBase * grid = field._grid;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
}
void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return;
}
}
}
void skipPastObjectRecord(std::string rec_name) {
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
return;
}
}
}
void skipScidacFieldRecord() {
skipPastObjectRecord(std::string(GRID_FORMAT));
skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
skipPastBinaryRecord();
}
};
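A minimal write-then-read sketch pairing the new ScidacReader with the existing ScidacWriter. This is hypothetical boilerplate: the field Umu, the file name and the close() calls are assumptions, while open(), writeScidacFieldRecord() and readScidacFieldRecord() are the interfaces shown above:
emptyUserRecord record;
ScidacWriter WR;
WR.open(std::string("ckpoint.scidac"));
WR.writeScidacFieldRecord(Umu, record);   // Lattice<vobj> field plus user record XML
WR.close();
ScidacReader RD;
RD.open(std::string("ckpoint.scidac"));
RD.readScidacFieldRecord(Umu, record);    // checksum record is verified on read
RD.close();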
class IldgWriter : public ScidacWriter {
public:
@ -425,8 +512,6 @@ class IldgWriter : public ScidacWriter {
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
uint64_t nbytes;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////

View File

@ -64,6 +64,11 @@ namespace Grid {
file compatibility, so it should be correct to assume the undocumented but de facto file structure.
/////////////////////////////////////////////////////////////////////////////////
struct emptyUserRecord : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
emptyUserRecord() { dummy=0; };
};
////////////////////////
// Scidac private file xml
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>

View File

@ -85,6 +85,9 @@ namespace Grid {
nd=4;
dimension.resize(4);
boundary.resize(4);
scidac_checksuma=0;
scidac_checksumb=0;
checksum=0;
}
};
@ -104,6 +107,7 @@ namespace Grid {
header.nd = nd;
header.dimension.resize(nd);
header.boundary.resize(nd);
header.data_start = 0;
for(int d=0;d<nd;d++) {
header.dimension[d] = grid->_fdimensions[d];
}

View File

@ -0,0 +1,100 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/AbstractEOFAFermion.h
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_ABSTRACT_EOFA_FERMION_H
#define GRID_QCD_ABSTRACT_EOFA_FERMION_H
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
namespace Grid {
namespace QCD {
// DJM: Abstract base class for EOFA fermion types.
// Defines layout of additional EOFA-specific parameters and operators.
// Use to construct EOFA pseudofermion actions that are agnostic to
// Shamir / Mobius / etc., and ensure that no one can construct EOFA
// pseudofermion action with non-EOFA fermion type.
template<class Impl>
class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
public:
// Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
RealD mq1;
RealD mq2;
RealD mq3;
RealD shift;
int pm;
RealD alpha; // Mobius scale
RealD k; // EOFA normalization constant
virtual void Instantiatable(void) = 0;
// EOFA-specific operations
// Force user to implement in derived classes
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag) = 0;
virtual void Dtilde (const FermionField& in, FermionField& out) = 0;
virtual void DtildeInv(const FermionField& in, FermionField& out) = 0;
// Implement derivatives in base class:
// for EOFA both DWF and Mobius just need d(Dw)/dU
virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDeriv(mat, U, V, dag);
};
virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDerivOE(mat, U, V, dag);
};
virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDerivEO(mat, U, V, dag);
};
// Recompute 5D coefficients for different value of shift constant
// (needed for heatbath loop over poles)
virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
// Constructors
AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
: CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
_mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
{
int Ls = this->Ls;
this->alpha = _b + _c;
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
};
};
}}
#endif
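For reference, the normalisation constant set in the constructor above is, writing \alpha = b + c,
\[ k \;=\; \frac{\alpha\,(m_{q3}-m_{q2})\,(\alpha+1)^{2L_s}}{\bigl[(\alpha+1)^{L_s}+m_{q2}(\alpha-1)^{L_s}\bigr]\,\bigl[(\alpha+1)^{L_s}+m_{q3}(\alpha-1)^{L_s}\bigr]} \]
so for the Shamir case constructed later in this diff (b = 1, c = 0, hence \alpha = 1) it reduces to k = m_{q3} - m_{q2}.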

View File

@ -77,7 +77,6 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
}
}
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
{
this->Report();
@ -119,7 +118,6 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
MooeeInvTime=0;
}
template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
@ -417,6 +415,8 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
assert(omega[i]!=Coeff_t(0.0));
bs[i] = 0.5*(bpc/omega[i] + bmc);
cs[i] = 0.5*(bpc/omega[i] - bmc);
std::cout<<GridLogMessage << "CayleyFermion5D "<<i<<" bs="<<bs[i]<<" cs="<<cs[i]<< std::endl;
}
////////////////////////////////////////////////////////

View File

@ -1,6 +1,6 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.h
@ -35,24 +35,24 @@ namespace Grid {
namespace QCD {
template<typename T> struct switcheroo {
static inline int iscomplex() { return 0; }
template<typename T> struct switcheroo {
static inline int iscomplex() { return 0; }
template<class vec>
static inline vec mult(vec a, vec b) {
return real_mult(a,b);
}
};
template<> struct switcheroo<ComplexD> {
static inline int iscomplex() { return 1; }
template<> struct switcheroo<ComplexD> {
static inline int iscomplex() { return 1; }
template<class vec>
static inline vec mult(vec a, vec b) {
return a*b;
}
};
template<> struct switcheroo<ComplexF> {
static inline int iscomplex() { return 1; }
template<> struct switcheroo<ComplexF> {
static inline int iscomplex() { return 1; }
template<class vec>
static inline vec mult(vec a, vec b) {
return a*b;
@ -90,14 +90,14 @@ namespace Grid {
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField &psi,
const FermionField &phi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
void M5Ddag(const FermionField &psi,
const FermionField &phi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
@ -125,7 +125,7 @@ namespace Grid {
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
void Meooe5D (const FermionField &in, FermionField &out);
void MeooeDag5D (const FermionField &in, FermionField &out);
@ -133,23 +133,23 @@ namespace Grid {
RealD mass;
// Cayley form Moebius (tanh and zolotarev)
std::vector<Coeff_t> omega;
std::vector<Coeff_t> omega;
std::vector<Coeff_t> bs; // S dependent coeffs
std::vector<Coeff_t> cs;
std::vector<Coeff_t> as;
std::vector<Coeff_t> cs;
std::vector<Coeff_t> as;
// For preconditioning Cayley form
std::vector<Coeff_t> bee;
std::vector<Coeff_t> cee;
std::vector<Coeff_t> aee;
std::vector<Coeff_t> beo;
std::vector<Coeff_t> ceo;
std::vector<Coeff_t> aeo;
std::vector<Coeff_t> bee;
std::vector<Coeff_t> cee;
std::vector<Coeff_t> aee;
std::vector<Coeff_t> beo;
std::vector<Coeff_t> ceo;
std::vector<Coeff_t> aeo;
// LDU factorisation of the eeoo matrix
std::vector<Coeff_t> lee;
std::vector<Coeff_t> leem;
std::vector<Coeff_t> uee;
std::vector<Coeff_t> ueem;
std::vector<Coeff_t> dee;
std::vector<Coeff_t> lee;
std::vector<Coeff_t> leem;
std::vector<Coeff_t> uee;
std::vector<Coeff_t> ueem;
std::vector<Coeff_t> dee;
// Matrices of 5d ee inverse params
Vector<iSinglet<Simd> > MatpInv;
@ -165,7 +165,7 @@ namespace Grid {
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
void CayleyReport(void);
void CayleyZeroCounters(void);
@ -179,9 +179,9 @@ namespace Grid {
double MooeeInvTime;
protected:
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
};
}

View File

@ -0,0 +1,438 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3,
RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
_shift, _pm, _M5, 1.0, 0.0, p)
{
RealD eps = 1.0;
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
assert(zdata->n == this->Ls);
std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
Approx::zolotarev_free(zdata);
}
/***************************************************************
* Additional EOFA operators only called outside the inverter.
* Since speed is not essential, simple axpby-style
* implementations should be fine.
***************************************************************/
template<class Impl>
void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
{
int Ls = this->Ls;
Din = zero;
if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
}
// This is just the identity for DWF
template<class Impl>
void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
// This is just the identity for DWF
template<class Impl>
void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
/*****************************************************************************************************/
template<class Impl>
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->Meooe5D(psi, Din);
this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi);
return(norm2(chi));
}
template<class Impl>
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->DW(psi, Din, DaggerYes);
this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
}
/********************************************************************
* Performance critical fermion operators called inside the inverter
********************************************************************/
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
int pm = this->pm;
RealD shift = this->shift;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
Coeff_t shiftp(0.0), shiftm(0.0);
if(shift != 0.0){
if(pm == 1){ shiftp = shift*(mq3-mq2); }
else{ shiftm = -shift*(mq3-mq2); }
}
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
#if(0)
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
for(int i=0; i<diag.size(); ++i){
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
}
for(int i=0; i<upper.size(); ++i){
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
}
for(int i=0; i<lower.size(); ++i){
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
}
#endif
this->M5D(psi, chi, chi, lower, diag, upper);
}
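In the conventions of the cache implementation further down this diff (spProj5m projects P_- onto the s+1 neighbour, spProj5p projects P_+ onto the s-1 neighbour, with periodic wrap in s), the three-argument M5D call above applies, per 4-d site,
\[ \chi_s \;=\; \mathrm{diag}[s]\,\phi_s \;+\; \mathrm{upper}[s]\,P_-\,\psi_{(s+1)\bmod L_s} \;+\; \mathrm{lower}[s]\,P_+\,\psi_{(s-1)\bmod L_s}, \]
with the mass and EOFA shift entering only through the wrap-around entries upper[Ls-1] and lower[0] set just above.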
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
int pm = this->pm;
RealD shift = this->shift;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
Coeff_t shiftp(0.0), shiftm(0.0);
if(shift != 0.0){
if(pm == 1){ shiftp = shift*(mq3-mq2); }
else{ shiftm = -shift*(mq3-mq2); }
}
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
#if(0)
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
for(int i=0; i<diag.size(); ++i){
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
}
for(int i=0; i<upper.size(); ++i){
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
}
for(int i=0; i<lower.size(); ++i){
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
}
#endif
this->M5Ddag(psi, chi, chi, lower, diag, upper);
}
// half checkerboard operations
template<class Impl>
void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
}
upper[Ls-1] = this->dm;
lower[0] = this->dp;
this->M5D(psi, psi, chi, lower, diag, upper);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
}
upper[Ls-1] = this->dp;
lower[0] = this->dm;
this->M5Ddag(psi, psi, chi, lower, diag, upper);
}
/****************************************************************************************/
//Zolo
template<class Impl>
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
{
int Ls = this->Ls;
int pm = this->pm;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
RealD shift = this->shift;
////////////////////////////////////////////////////////
// Constants for the preconditioned matrix Cayley form
////////////////////////////////////////////////////////
this->bs.resize(Ls);
this->cs.resize(Ls);
this->aee.resize(Ls);
this->aeo.resize(Ls);
this->bee.resize(Ls);
this->beo.resize(Ls);
this->cee.resize(Ls);
this->ceo.resize(Ls);
for(int i=0; i<Ls; ++i){
this->bee[i] = 4.0 - this->M5 + 1.0;
this->cee[i] = 1.0;
}
for(int i=0; i<Ls; ++i){
this->aee[i] = this->cee[i];
this->bs[i] = this->beo[i] = 1.0;
this->cs[i] = this->ceo[i] = 0.0;
}
//////////////////////////////////////////
// EOFA shift terms
//////////////////////////////////////////
if(pm == 1){
this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
this->dm = mq1*this->cee[Ls-1];
} else if(this->pm == -1) {
this->dp = mq1*this->cee[0];
this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
} else {
this->dp = mq1*this->cee[0];
this->dm = mq1*this->cee[Ls-1];
}
//////////////////////////////////////////
// LDU decomposition of eeoo
//////////////////////////////////////////
this->dee.resize(Ls+1);
this->lee.resize(Ls);
this->leem.resize(Ls);
this->uee.resize(Ls);
this->ueem.resize(Ls);
for(int i=0; i<Ls; ++i){
if(i < Ls-1){
this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
this->leem[i] = this->dm/this->bee[i];
for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
this->dee[i] = this->bee[i];
this->uee[i] = -this->aee[i]/this->bee[i]; // up-diag entry on the ith row
this->ueem[i] = this->dp / this->bee[0];
for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
} else {
this->lee[i] = 0.0;
this->leem[i] = 0.0;
this->uee[i] = 0.0;
this->ueem[i] = 0.0;
}
}
{
Coeff_t delta_d = 1.0 / this->bee[0];
for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
}
int inv = 1;
this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
}
// Recompute Cayley-form coefficients for different shift
template<class Impl>
void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
{
this->shift = new_shift;
Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
int Ls = this->Ls;
GridBase* grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if(LLs == Ls){ return; } // Not vectorised in 5th direction
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0; s<Ls; s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
}
for(int s=0; s<Ls-1; s++){
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = this->dp;
Pminus(Ls-1,0) = this->dm;
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
#if(0)
std::cout << GridLogMessage << "Pplus:" << std::endl;
for(int s=0; s<Ls; ++s){
for(int ss=0; ss<Ls; ++ss){
std::cout << Pplus(s,ss) << "\t";
}
std::cout << std::endl;
}
std::cout << GridLogMessage << "Pminus:" << std::endl;
for(int s=0; s<Ls; ++s){
for(int ss=0; ss<Ls; ++ss){
std::cout << Pminus(s,ss) << "\t";
}
std::cout << std::endl;
}
#endif
if(inv) {
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd = Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0; s2<Ls; s2++){
for(int s1=0; s1<LLs; s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type*) &Vp;
scalar_type *sm = (scalar_type*) &Vm;
for(int l=0; l<Nsimd; l++){
if(switcheroo<Coeff_t>::iscomplex()) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
FermOpTemplateInstantiate(DomainWallEOFAFermion);
GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
}}

View File

@ -0,0 +1,115 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.h
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
#define GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
public:
// Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
// for red-black preconditioned Shamir EOFA
Coeff_t dm;
Coeff_t dp;
virtual void Instantiatable(void) {};
// EOFA-specific operations
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag);
virtual void Dtilde (const FermionField& in, FermionField& out);
virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply
virtual RealD M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out);
virtual void MooeeDag (const FermionField& in, FermionField& out);
virtual void MooeeInv (const FermionField& in, FermionField& out);
virtual void MooeeInvDag(const FermionField& in, FermionField& out);
virtual void M5D (const FermionField& psi, FermionField& chi);
virtual void M5Ddag (const FermionField& psi, FermionField& chi);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
virtual void RefreshShiftCoefficients(RealD new_shift);
// Constructors
DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
RealD _M5, const ImplParams& p=ImplParams());
protected:
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
};
}}
#define INSTANTIATE_DPERP_DWF_EOFA(A)\
template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
#undef DOMAIN_WALL_EOFA_DPERP_DENSE
#define DOMAIN_WALL_EOFA_DPERP_CACHE
#undef DOMAIN_WALL_EOFA_DPERP_LINALG
#define DOMAIN_WALL_EOFA_DPERP_VEC
#endif
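These defines select which of the per-implementation variants in this changeset get compiled: with the choice above the cache and vectorised kernels are active while the dense and linalg ones are not. The variant files guard their instantiations accordingly, e.g. in DomainWallEOFAFermioncache.cc:
#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
  INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
  INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
  // ... remaining Impl types, see DomainWallEOFAFermioncache.cc below ...
#endif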

View File

@ -0,0 +1,248 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
int Ls = this->Ls;
GridBase* grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
for(int s=0; s<Ls; s++){
auto tmp = psi._odata[0];
if(s==0) {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5m(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
int Ls = this->Ls;
GridBase* grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
for(int s=0; s<Ls; s++){
if(s==0) {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5p(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
// Apply (L^{\prime})^{-1}
chi[ss] = psi[ss]; // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
spProj5p(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
spProj5m(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
spProj5p(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
}
spProj5m(tmp2, chi[ss+Ls-1]);
chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
spProj5m(tmp1, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
}
}
this->MooeeInvTime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
assert(psi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
std::vector<Coeff_t> ueec(Ls);
std::vector<Coeff_t> deec(Ls+1);
std::vector<Coeff_t> leec(Ls);
std::vector<Coeff_t> ueemc(Ls);
std::vector<Coeff_t> leemc(Ls);
for(int s=0; s<ueec.size(); s++){
ueec[s] = conjugate(this->uee[s]);
deec[s] = conjugate(this->dee[s]);
leec[s] = conjugate(this->lee[s]);
ueemc[s] = conjugate(this->ueem[s]);
leemc[s] = conjugate(this->leem[s]);
}
deec[Ls] = conjugate(this->dee[Ls]);
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
// Apply (U^{\prime})^{-dagger}
chi[ss] = psi[ss];
for(int s=1; s<Ls; s++){
spProj5m(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
spProj5p(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
spProj5m(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
}
spProj5p(tmp2, chi[ss+Ls-1]);
chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
spProj5p(tmp1, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
}
}
this->MooeeInvTime += usecond();
}
#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
#endif
}}

View File

@ -0,0 +1,159 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard = psi.checkerboard;
assert(Ls==LLs);
Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
}
for(int s=0; s<Ls-1; s++){
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = this->dp;
Pminus(Ls-1,0) = this->dm;
Eigen::MatrixXd PplusMat ;
Eigen::MatrixXd PminusMat;
if(inv) {
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
// For the non-vectorised s-direction this is simple
for(auto site=0; site<vol; site++){
SiteSpinor SiteChi;
SiteHalfSpinor SitePplus;
SiteHalfSpinor SitePminus;
for(int s1=0; s1<Ls; s1++){
SiteChi = zero;
for(int s2=0; s2<Ls; s2++){
int lex2 = s2 + Ls*site;
if(PplusMat(s1,s2) != 0.0){
spProj5p(SitePplus,psi[lex2]);
accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
}
if(PminusMat(s1,s2) != 0.0){
spProj5m(SitePminus, psi[lex2]);
accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
}
}
chi[s1+Ls*site] = SiteChi*0.5;
}
}
}
#ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}
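A minimal standalone sketch (not from the Grid sources) of the per-site matrix construction used by MooeeInternal above, assuming plain double coefficients bee, cee, dp and dm in place of the class members; it shows the tridiagonal-plus-corner structure and the Eigen handling of the inv and dag flags:

#include <vector>
#include <Eigen/Dense>

// Sketch only: mirrors the Pplus/Pminus construction in MooeeInternal above.
void buildEOFAMatrices(const std::vector<double>& bee, const std::vector<double>& cee,
                       double dp, double dm, bool dag, bool inv,
                       Eigen::MatrixXd& PplusMat, Eigen::MatrixXd& PminusMat)
{
  const int Ls = (int)bee.size();
  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls, Ls);
  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls, Ls);
  for(int s=0; s<Ls;   s++){ Pplus(s,s)    =  bee[s];   Pminus(s,s)   =  bee[s];   } // diagonal
  for(int s=0; s<Ls-1; s++){ Pminus(s,s+1) = -cee[s];   Pplus(s+1,s)  = -cee[s+1]; } // off-diagonals
  Pplus (0, Ls-1) = dp;  // EOFA corner terms
  Pminus(Ls-1, 0) = dm;
  if(inv){ PplusMat = Pplus.inverse(); PminusMat = Pminus.inverse(); } // InverseYes path
  else   { PplusMat = Pplus;           PminusMat = Pminus;           }
  if(dag){ PplusMat.adjointInPlace();  PminusMat.adjointInPlace();  } // DaggerYes path
}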

View File

@ -0,0 +1,168 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (L^{\prime})^{-1}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
}
axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1);
axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[Ls]
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (U^{\prime})^{-dagger}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
}
axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1);
axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[Ls]
}
}
#ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
#endif
}}
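Unrolling the axpby_ssp_pminus/axpby_ssp_pplus pairs above, the M5D kernel in this file applies, slice by slice in s (a sketch read off the code, with P_{+/-} = (1 +/- gamma_5)/2 and the s index taken mod Ls):

    chi_s = diag_s * phi_s + upper_s * P_- psi_{s+1} + lower_s * P_+ psi_{s-1}

M5Ddag is the same expression with P_+ and P_- interchanged, and MooeeInv/MooeeInvDag then apply the corresponding LDU forward- and back-substitutions in s rather than forming an explicit inverse.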

View File

@ -0,0 +1,605 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
assert(Nc == 3);
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5m(hp, psi[ss+vp]);
spProj5p(hm, psi[ss+vm]);
if (vp <= v){ rotate(hp, hp, 1); }
if (vm >= v){ rotate(hm, hm, nsimd-1); }
hp = 0.5*hp;
hm = 0.5*hm;
spRecon5m(fp, hp);
spRecon5p(fm, hm);
chi[ss+v] = d[v]*phi[ss+v];
chi[ss+v] = chi[ss+v] + u[v]*fp;
chi[ss+v] = chi[ss+v] + l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v==LLs-1) ? 0 : v+1;
int vm = (v==0) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
if(vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5p(hp, psi[ss+vp]);
spProj5m(hm, psi[ss+vm]);
if(vp <= v){ rotate(hp, hp, 1); }
if(vm >= v){ rotate(hm, hm, nsimd-1); }
hp = hp*0.5;
hm = hm*0.5;
spRecon5p(fp, hp);
spRecon5m(fm, hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if (vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
#ifdef AVX512
#include<simd/Intel512common.h>
#include<simd/Intel512avx.h>
#include<simd/Intel512single.h>
#endif
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
int s = s2 + l*LLs;
int lex = s2 + LLs*site;
if( s2==0 && l==0 ){
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
}}
}}
{
int lex = s1 + LLs*site;
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %%zmm1
#define Chi_01 %%zmm2
#define Chi_02 %%zmm3
#define Chi_10 %%zmm4
#define Chi_11 %%zmm5
#define Chi_12 %%zmm6
#define Chi_20 %%zmm7
#define Chi_21 %%zmm8
#define Chi_22 %%zmm9
#define Chi_30 %%zmm10
#define Chi_31 %%zmm11
#define Chi_32 %%zmm12
#define BCAST0 %%zmm13
#define BCAST1 %%zmm14
#define BCAST2 %%zmm15
#define BCAST3 %%zmm16
#define BCAST4 %%zmm17
#define BCAST5 %%zmm18
#define BCAST6 %%zmm19
#define BCAST7 %%zmm20
#define BCAST8 %%zmm21
#define BCAST9 %%zmm22
#define BCAST10 %%zmm23
#define BCAST11 %%zmm24
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
int lex = s2 + LLs*site;
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t) &psi[lex];
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
if((s2+l)==0) {
asm(
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0)
VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2)
VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
VMULMEM(0,%1,BCAST8,Chi_22)
VMULMEM(0,%1,BCAST9,Chi_30)
VMULMEM(0,%1,BCAST10,Chi_31)
VMULMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
} else {
asm(
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
}
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(Simd::scalar_type);
}
}
{
int lexa = s1+LLs*site;
asm (
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
// Z-mobius version
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
exit(-1);
};
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard = psi.checkerboard;
Vector<iSinglet<Simd> > Matp;
Vector<iSinglet<Simd> > Matm;
Vector<iSinglet<Simd> > *_Matp;
Vector<iSinglet<Simd> > *_Matm;
// MooeeInternalCompute(dag,inv,Matp,Matm);
if(inv && dag){
_Matp = &this->MatpInvDag;
_Matm = &this->MatmInvDag;
}
if(inv && (!dag)){
_Matp = &this->MatpInv;
_Matm = &this->MatmInv;
}
if(!inv){
MooeeInternalCompute(dag, inv, Matp, Matm);
_Matp = &Matp;
_Matm = &Matm;
}
assert(_Matp->size() == Ls*LLs);
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
if(switcheroo<Coeff_t>::iscomplex()){
parallel_for(auto site=0; site<vol; site++){
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
} else {
parallel_for(auto site=0; site<vol; site++){
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
}
this->MooeeInvTime += usecond();
}
#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}
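The coefficient set-up at the top of M5D and M5Ddag in this vectorised file interleaves the Ls coefficients across SIMD lanes via a type pun. A small scalar sketch of the same index mapping (hypothetical helper, with plain doubles standing in for the Grid Simd scalar_type):

#include <vector>

// Logical s-slice index s = o + i*LLs is stored at packed slot ss = o*nsimd + i,
// i.e. the LLs vectorised fifth-dimension sites run outermost, the SIMD lanes innermost.
std::vector<double> packCoeffs(const std::vector<double>& coeff, int LLs, int nsimd)
{
  std::vector<double> packed(coeff.size());
  for(int o=0; o<LLs; o++){       // outer: vectorised fifth-dimension site
    for(int i=0; i<nsimd; i++){   // inner: SIMD lane
      int s  = o + i*LLs;
      int ss = o*nsimd + i;
      packed[ss] = coeff[s];
    }
  }
  return packed;
}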

View File

@ -1,6 +1,6 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h
@ -38,6 +38,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
// - ContinuedFractionFermion5D.cc
// - WilsonFermion.cc
// - WilsonKernels.cc
// - DomainWallEOFAFermion.cc
// - MobiusEOFAFermion.cc
//
// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
// for EVERY .cc file. This define centralises the list and restores global push of impl cases
@ -55,11 +57,12 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
#include <Grid/qcd/action/fermion/MobiusFermion.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
//#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
@ -113,6 +116,14 @@ typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@ -121,6 +132,14 @@ typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
@ -129,7 +148,7 @@ typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
// Ls vectorised
typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
@ -138,6 +157,14 @@ typedef DomainWallFermion<DomainWallVec5dImplRL> DomainWallFermionVec5dRL;
typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;
typedef DomainWallEOFAFermion<DomainWallVec5dImplR> DomainWallEOFAFermionVec5dR;
typedef DomainWallEOFAFermion<DomainWallVec5dImplF> DomainWallEOFAFermionVec5dF;
typedef DomainWallEOFAFermion<DomainWallVec5dImplD> DomainWallEOFAFermionVec5dD;
typedef DomainWallEOFAFermion<DomainWallVec5dImplRL> DomainWallEOFAFermionVec5dRL;
typedef DomainWallEOFAFermion<DomainWallVec5dImplFH> DomainWallEOFAFermionVec5dFH;
typedef DomainWallEOFAFermion<DomainWallVec5dImplDF> DomainWallEOFAFermionVec5dDF;
typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
@ -146,6 +173,14 @@ typedef MobiusFermion<DomainWallVec5dImplRL> MobiusFermionVec5dRL;
typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;
typedef MobiusEOFAFermion<DomainWallVec5dImplR> MobiusEOFAFermionVec5dR;
typedef MobiusEOFAFermion<DomainWallVec5dImplF> MobiusEOFAFermionVec5dF;
typedef MobiusEOFAFermion<DomainWallVec5dImplD> MobiusEOFAFermionVec5dD;
typedef MobiusEOFAFermion<DomainWallVec5dImplRL> MobiusEOFAFermionVec5dRL;
typedef MobiusEOFAFermion<DomainWallVec5dImplFH> MobiusEOFAFermionVec5dFH;
typedef MobiusEOFAFermion<DomainWallVec5dImplDF> MobiusEOFAFermionVec5dDF;
typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
@ -206,6 +241,14 @@ typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
@ -222,6 +265,14 @@ typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;

View File

@ -538,6 +538,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
}
template <class ref>
inline void loadLinkElement(Simd &reg, ref &memory) {
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
conformable(Uds._grid,GaugeGrid);

View File

@ -0,0 +1,502 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3,
RealD _shift, int _pm, RealD _M5,
RealD _b, RealD _c, const ImplParams &p) :
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
_shift, _pm, _M5, _b, _c, p)
{
int Ls = this->Ls;
RealD eps = 1.0;
Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
assert(zdata->n == this->Ls);
std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
",c=" << _c << ") with Ls=" << Ls << std::endl;
this->SetCoefficientsTanh(zdata, _b, _c);
std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
",pm=" << _pm << ")" << std::endl;
Approx::zolotarev_free(zdata);
if(_shift != 0.0){
SetCoefficientsPrecondShiftOps();
} else {
Mooee_shift.resize(Ls, 0.0);
MooeeInv_shift_lc.resize(Ls, 0.0);
MooeeInv_shift_norm.resize(Ls, 0.0);
MooeeInvDag_shift_lc.resize(Ls, 0.0);
MooeeInvDag_shift_norm.resize(Ls, 0.0);
}
}
/****************************************************************
* Additional EOFA operators only called outside the inverter.
* Since speed is not essential, simple axpby-style
* implementations should be fine.
***************************************************************/
template<class Impl>
void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
{
int Ls = this->Ls;
RealD alpha = this->alpha;
Din = zero;
if((sign == 1) && (dag == 0)) { // \Omega_{+}
for(int s=0; s<Ls; ++s){
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
}
} else if((sign == -1) && (dag == 0)) { // \Omega_{-}
for(int s=0; s<Ls; ++s){
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
}
} else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
for(int sp=0; sp<Ls; ++sp){
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
}
} else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
for(int sp=0; sp<Ls; ++sp){
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
}
}
}
// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
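// Editorial note (a reading of the loop below, with b = (1+alpha)/2 and c = (1-alpha)/2):
//   (Dtilde psi)_s = b psi_s - c P_- psi_{s+1} - c P_+ psi_{s-1}   in the bulk,
//   while at s = 0 the P_+ coupling wraps around to +mq1*c P_+ psi_{Ls-1}, and at
//   s = Ls-1 the P_- coupling wraps around to +mq1*c P_- psi_0.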
template<class Impl>
void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
RealD b = 0.5 * ( 1.0 + this->alpha );
RealD c = 0.5 * ( 1.0 - this->alpha );
RealD mq1 = this->mq1;
for(int s=0; s<Ls; ++s){
if(s == 0) {
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
} else if(s == (Ls-1)) {
axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
} else {
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
}
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
RealD m = this->mq1;
RealD c = 0.5 * this->alpha;
RealD d = 0.5;
RealD DtInv_p(0.0), DtInv_m(0.0);
RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
FermionField tmp(this->FermionGrid());
for(int s=0; s<Ls; ++s){
for(int sp=0; sp<Ls; ++sp){
DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
if(sp == 0){
axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
} else {
axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
}
}}
}
/*****************************************************************************************************/
template<class Impl>
RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->Meooe5D(psi, Din);
this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi);
return(norm2(chi));
}
template<class Impl>
RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->DW(psi, Din, DaggerYes);
this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
}
/********************************************************************
* Performance critical fermion operators called inside the inverter
********************************************************************/
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
// no shift term
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
// fused M + shift operation
else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
// no shift term
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
// fused M + shift operation
else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
}
// half checkerboard operations
template<class Impl>
void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
// coefficients of Mooee
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
}
upper[Ls-1] *= -this->mq1;
lower[0] *= -this->mq1;
// no shift term
if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
// fused M + shift operation
else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
// coefficients of MooeeDag
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
if(s==0) {
upper[s] = -this->cee[s+1];
lower[s] = this->mq1*this->cee[Ls-1];
} else if(s==(Ls-1)) {
upper[s] = this->mq1*this->cee[0];
lower[s] = -this->cee[s-1];
} else {
upper[s] = -this->cee[s+1];
lower[s] = -this->cee[s-1];
}
}
// no shift term
if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
// fused M + shift operation
else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
}
/****************************************************************************************/
// Computes coefficients for applying Cayley preconditioned shift operators
// (Mooee + \Delta) --> Mooee_shift
// (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
// (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
// For the latter two cases, the operation takes the form
// [ (Mooee + \Delta)^{-1} \psi ]_{i} = MooeeInv_{ij} \psi_{j} +
// ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
template<class Impl>
void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
{
int Ls = this->Ls;
int pm = this->pm;
RealD alpha = this->alpha;
RealD k = this->k;
RealD mq1 = this->mq1;
RealD shift = this->shift;
// Initialize
Mooee_shift.resize(Ls);
MooeeInv_shift_lc.resize(Ls);
MooeeInv_shift_norm.resize(Ls);
MooeeInvDag_shift_lc.resize(Ls);
MooeeInvDag_shift_norm.resize(Ls);
// Construct Mooee_shift
int idx(0);
Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
for(int s=0; s<Ls; ++s){
idx = (pm == 1) ? (s) : (Ls-1-s);
Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
}
// Tridiagonal solve for MooeeInvDag_shift_lc
{
Coeff_t m(0.0);
std::vector<Coeff_t> d = Mooee_shift;
std::vector<Coeff_t> u(Ls,0.0);
std::vector<Coeff_t> y(Ls,0.0);
std::vector<Coeff_t> q(Ls,0.0);
if(pm == 1){ u[0] = 1.0; }
else{ u[Ls-1] = 1.0; }
// Tridiagonal matrix algorithm + Sherman-Morrison formula
//
// We solve
// ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
// where Mooee' is the tridiagonal part of Mooee_{+}, and
// u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
// so that the outer-product u \otimes v gives the (0,Ls-1)
// entry of Mooee_{+}.
//
// We do this as two solves: Mooee'*y = d and Mooee'*q = u,
// and then construct the solution to the original system
// MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
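// (Editorial reference: the Sherman-Morrison identity being used is
//    (A + u v^T)^{-1} d = A^{-1} d - ( <v, A^{-1} d> / (1 + <v, A^{-1} u>) ) A^{-1} u,
//  so with y = A^{-1} d and q = A^{-1} u the combination above follows directly.)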
if(pm == 1){
for(int s=1; s<Ls; ++s){
m = -this->cee[s] / this->bee[s-1];
d[s] -= m*d[s-1];
u[s] -= m*u[s-1];
}
}
y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
for(int s=Ls-2; s>=0; --s){
if(pm == 1){
y[s] = d[s] / this->bee[s];
q[s] = u[s] / this->bee[s];
} else {
y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
}
}
// Construct MooeeInvDag_shift_lc
for(int s=0; s<Ls; ++s){
if(pm == 1){
MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
(1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
} else {
MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
(1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
}
}
// Compute remaining coefficients
N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
for(int s=0; s<Ls; ++s){
// MooeeInv_shift_lc
if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }
// MooeeInv_shift_norm
MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;
// MooeeInvDag_shift_norm
if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
}
}
}
// Recompute coefficients for a different value of the shift constant
template<class Impl>
void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
{
this->shift = new_shift;
if(new_shift != 0.0){
SetCoefficientsPrecondShiftOps();
} else {
int Ls = this->Ls;
Mooee_shift.resize(Ls,0.0);
MooeeInv_shift_lc.resize(Ls,0.0);
MooeeInv_shift_norm.resize(Ls,0.0);
MooeeInvDag_shift_lc.resize(Ls,0.0);
MooeeInvDag_shift_norm.resize(Ls,0.0);
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
int Ls = this->Ls;
GridBase* grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if(LLs == Ls){ return; } // Not vectorised in 5th direction
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0; s<Ls; s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = this->mq1*this->cee[0];
Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
if(this->shift != 0.0){
RealD c = 0.5 * this->alpha;
RealD d = 0.5;
RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
if(this->pm == 1) {
for(int s=0; s<Ls; ++s){
Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
}
} else {
for(int s=0; s<Ls; ++s){
Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
}
}
}
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
if(inv) {
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd = Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0; s2<Ls; s2++){
for(int s1=0; s1<LLs; s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type*) &Vp;
scalar_type *sm = (scalar_type*) &Vm;
for(int l=0; l<Nsimd; l++){
if(switcheroo<Coeff_t>::iscomplex()) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
FermOpTemplateInstantiate(MobiusEOFAFermion);
GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
}}
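As a cross-check on the coefficient setup in SetCoefficientsPrecondShiftOps above, here is a standalone scalar sketch of the Mooee_shift construction (hypothetical free function, not part of Grid; alpha, k, mq1 and shift stand in for the class members and Coeff_t is taken to be real):

#include <cmath>
#include <vector>

// Mirrors the "Construct Mooee_shift" loop: coefficients are filled forwards for
// pm = +1 and in reverse order for pm = -1.
std::vector<double> mooeeShiftCoeffs(int Ls, int pm, double alpha,
                                     double k, double mq1, double shift)
{
  const double N = ((pm == 1) ? 1.0 : -1.0) * (2.0*shift*k)
                 * ( std::pow(alpha+1.0, Ls) + mq1*std::pow(alpha-1.0, Ls) );
  std::vector<double> Mooee_shift(Ls);
  for(int s=0; s<Ls; ++s){
    int idx = (pm == 1) ? s : (Ls-1-s);
    Mooee_shift[idx] = N * std::pow(-1.0, s) * std::pow(alpha-1.0, s)
                         / std::pow(alpha+1.0, Ls+s+1);
  }
  return Mooee_shift;
}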

View File

@ -0,0 +1,133 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.h
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_MOBIUS_EOFA_FERMION_H
#define GRID_QCD_MOBIUS_EOFA_FERMION_H
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
public:
// Shift operator coefficients for red-black preconditioned Mobius EOFA
std::vector<Coeff_t> Mooee_shift;
std::vector<Coeff_t> MooeeInv_shift_lc;
std::vector<Coeff_t> MooeeInv_shift_norm;
std::vector<Coeff_t> MooeeInvDag_shift_lc;
std::vector<Coeff_t> MooeeInvDag_shift_norm;
virtual void Instantiatable(void) {};
// EOFA-specific operations
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag);
virtual void Dtilde (const FermionField& in, FermionField& out);
virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply
virtual RealD M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out);
virtual void MooeeDag (const FermionField& in, FermionField& out);
virtual void MooeeInv (const FermionField& in, FermionField& out);
virtual void MooeeInv_shift (const FermionField& in, FermionField& out);
virtual void MooeeInvDag (const FermionField& in, FermionField& out);
virtual void MooeeInvDag_shift(const FermionField& in, FermionField& out);
virtual void M5D (const FermionField& psi, FermionField& chi);
virtual void M5Ddag (const FermionField& psi, FermionField& chi);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs);
void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
virtual void RefreshShiftCoefficients(RealD new_shift);
// Constructors
MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());
protected:
void SetCoefficientsPrecondShiftOps(void);
};
}}
#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);
#undef MOBIUS_EOFA_DPERP_DENSE
#define MOBIUS_EOFA_DPERP_CACHE
#undef MOBIUS_EOFA_DPERP_LINALG
#define MOBIUS_EOFA_DPERP_VEC
#endif
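A hedged usage sketch for this header: it follows the constructor declared above and the MobiusEOFAFermionR typedef added in Fermion_base_aggregate.h, using the grid-setup helpers that Grid's test programs normally use; the numerical parameters are placeholders only.

#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char** argv)
{
  Grid_init(&argc, &argv);

  const int Ls = 8;
  GridCartesian*         UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian*         FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  LatticeGaugeField Umu(UGrid);
  // ... load or generate a gauge configuration into Umu here ...

  // Placeholder EOFA parameters (mq1, mq2, mq3, shift, pm, M5, b, c)
  RealD mq1 = 0.01, mq2 = 0.05, mq3 = 1.0, shift = 0.0, M5 = 1.8, b = 1.5, c = 0.5;
  int   pm  = 1;
  MobiusEOFAFermionR Deofa(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                           mq1, mq2, mq3, shift, pm, M5, b, c);

  LatticeFermion src(FGrid), res(FGrid);
  src = zero;
  Deofa.M(src, res);   // apply the full five-dimensional EOFA operator

  Grid_finalize();
  return 0;
}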

View File

@ -0,0 +1,429 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
{
int Ls = this->Ls;
GridBase *grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
for(int s=0; s<Ls; s++){
auto tmp = psi._odata[0];
if(s==0){
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5m(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
std::vector<Coeff_t> &shift_coeffs)
{
int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
GridBase *grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
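// Same tridiagonal structure as M5D, plus a shift column: every row s also picks up
// shift_coeffs[s] * P_+ psi[Ls-1] (pm == +1) or shift_coeffs[s] * P_- psi[0] (pm == -1).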
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
for(int s=0; s<Ls; s++){
auto tmp = psi._odata[0];
if(s==0){
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5m(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
else{ spProj5m(tmp, psi._odata[ss+shift_s]); }
chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
{
int Ls = this->Ls;
GridBase *grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp = psi._odata[0];
for(int s=0; s<Ls; s++){
if(s==0) {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5p(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
std::vector<Coeff_t> &shift_coeffs)
{
int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
GridBase *grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
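// The daggered shift scatters shift_coeffs[s]*P_{+/-} psi[s] into the single row shift_s
// (Ls-1 for pm == +1, 0 for pm == -1). Row Ls-1 is accumulated with += inside the loop,
// so it must be cleared before the s loop starts.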
chi[ss+Ls-1] = zero;
auto tmp = psi._odata[0];
for(int s=0; s<Ls; s++){
if(s==0) {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5p(tmp, psi._odata[ss+0]);
chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
else{ spProj5m(tmp, psi._odata[ss+s]); }
chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
GridBase *grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
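// Moo^{-1} is applied via the stored LDU factorisation of the 5d matrix: forward-substitute
// through L', fold the leem terms of the last row into chi[Ls-1], apply D^{-1} together with
// the ueem column, then back-substitute through U.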
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp = psi._odata[0];
// Apply (L^{\prime})^{-1}
chi[ss] = psi[ss]; // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
spProj5p(tmp, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // chi[Ls-1] = chi[Ls-1] - leem[s] P_- chi[s]
spProj5m(tmp, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){ // chi[s] = (1/dee[s]) chi[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]
spProj5p(tmp, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
}
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
spProj5m(tmp, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
}
}
this->MooeeInvTime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
{
GridBase *grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
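// Same solve as MooeeInv, plus the shift correction: tmp2 accumulates
// sum_s MooeeInv_shift_lc[s]*psi[s], which is chirally projected (P_+ for pm == +1,
// P_- for pm == -1) and added back to each chi[s] with weight MooeeInv_shift_norm[s].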
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
auto tmp2_spProj = psi._odata[0];
// Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
chi[ss] = psi[ss]; // chi[0]=psi[0]
tmp2 = MooeeInv_shift_lc[0]*psi[ss];
for(int s=1; s<Ls; s++){
spProj5p(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
}
if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
else{ spProj5m(tmp2_spProj, tmp2); }
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // chi[Ls-1] = chi[Ls-1] - leem[s] P_- chi[s]
spProj5m(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){ // chi[s] = (1/dee[s]) chi[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]
spProj5p(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
}
// chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
spProj5m(tmp1, chi[ss+Ls-1]);
chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
// Apply U^{-1} and add shift term
for(int s=Ls-2; s>=0; s--){
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
spProj5m(tmp1, chi[ss+s]);
chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
}
}
this->MooeeInvTime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
{
if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
GridBase *grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp = psi._odata[0];
// Apply (U^{\prime})^{-dag}
chi[ss] = psi[ss];
for(int s=1; s<Ls; s++){
spProj5m(tmp, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
}
// U_m^{-\dag}
for(int s=0; s<Ls-1; s++){
spProj5p(tmp, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
}
// L_m^{-\dag} D^{-dag}
for(int s=0; s<Ls-1; s++){
spProj5m(tmp, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
}
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
// Apply L^{-dag}
for(int s=Ls-2; s>=0; s--){
spProj5p(tmp, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
}
}
this->MooeeInvTime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
{
GridBase *grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
auto tmp2_spProj = psi._odata[0];
// Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
chi[ss] = psi[ss];
tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
for(int s=1; s<Ls; s++){
spProj5m(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
}
if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
else{ spProj5m(tmp2_spProj, tmp2); }
// U_m^{-\dag}
for(int s=0; s<Ls-1; s++){
spProj5p(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
}
// L_m^{-\dag} D^{-dag}
for(int s=0; s<Ls-1; s++){
spProj5m(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
}
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
spProj5p(tmp1, chi[ss+Ls-1]);
chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
// Apply L^{-dag}
for(int s=Ls-2; s>=0; s--){
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
spProj5p(tmp1, chi[ss+s]);
chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
}
}
this->MooeeInvTime += usecond();
}
#ifdef MOBIUS_EOFA_DPERP_CACHE
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
#endif
}}


@ -0,0 +1,184 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
int pm = this->pm;
RealD shift = this->shift;
RealD alpha = this->alpha;
RealD k = this->k;
RealD mq1 = this->mq1;
chi.checkerboard = psi.checkerboard;
assert(Ls==LLs);
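// Moo acts as Pplus on the P_+ projected part of psi and as Pminus on the P_- projected part.
// Both are Ls x Ls tridiagonal matrices with bee on the diagonal, -cee off the diagonal and an
// mq1*cee corner term; a non-zero EOFA shift adds extra entries to the last column of Pplus
// (pm == +1) or Pminus (pm == -1).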
Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
}
for(int s=0; s<Ls-1; s++){
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = mq1*this->cee[0];
Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
if(shift != 0.0){
Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
for(int s=0; s<Ls; ++s){
if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
}
}
Eigen::MatrixXd PplusMat ;
Eigen::MatrixXd PminusMat;
if(inv){
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
// For the non-vectorised s-direction this is simple
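// Each output slice s1 accumulates the chirally projected psi[s2], weighted by Pplus(s1,s2)
// for the P_+ part and Pminus(s1,s2) for the P_- part.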
for(auto site=0; site<vol; site++){
SiteSpinor SiteChi;
SiteHalfSpinor SitePplus;
SiteHalfSpinor SitePminus;
for(int s1=0; s1<Ls; s1++){
SiteChi = zero;
for(int s2=0; s2<Ls; s2++){
int lex2 = s2 + Ls*site;
if(PplusMat(s1,s2) != 0.0){
spProj5p(SitePplus,psi[lex2]);
accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
}
if(PminusMat(s1,s2) != 0.0){
spProj5m(SitePminus, psi[lex2]);
accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
}
}
chi[s1+Ls*site] = SiteChi*0.5;
}
}
}
#ifdef MOBIUS_EOFA_DPERP_DENSE
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}


@ -0,0 +1,290 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
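// axpby_ssp_pminus(chi, a, phi, b, psi, s, sp) sets chi[s] = a*phi[s] + b*P_- psi[sp]
// (axpby_ssp_pplus analogously with P_+), so the branches below build the same tridiagonal
// structure as the cache implementation, one s slice at a time.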
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
}
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
}
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
// Apply (L^{\prime})^{-1}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // chi[Ls-1] = chi[Ls-1] - leem[s] P_- chi[s]
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
}
axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[s] = chi[s] - uee[s] P_- chi[s+1]
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
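// tmp[0] accumulates sum_s MooeeInv_shift_lc[s]*psi[s]; its chiral projection is added back
// into chi[s] with weight MooeeInv_shift_norm[s] during the back substitution below.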
// Apply (L^{\prime})^{-1}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
axpby_ssp(tmp, czero, tmp, this->MooeeInv_shift_lc[0], psi, 0, 0);
for(int s=1; s<Ls; s++){
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s);
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // chi[Ls-1] = chi[Ls-1] - leem[s] P_- chi[s]
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
}
axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
// Apply U^{-1} and add shift term
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[s] = chi[s] - uee[s] P_- chi[s+1]
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
// Apply (U^{\prime})^{-dagger}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
}
axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[s] = chi[s] - conj(lee[s]) P_+ chi[s+1]
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
axpby_ssp(tmp, czero, tmp, this->MooeeInvDag_shift_lc[0], psi, 0, 0);
for(int s=1; s<Ls; s++){
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s);
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
}
axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
// Apply L^{-dagger} and add shift
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[s] = chi[s] - conj(lee[s]) P_+ chi[s+1]
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
}
}
#ifdef MOBIUS_EOFA_DPERP_LINALG
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
#endif
}}


@ -0,0 +1,983 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd>> u(LLs);
Vector<iSinglet<Simd>> l(LLs);
Vector<iSinglet<Simd>> d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
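// The fifth dimension is spread across SIMD lanes: logical slice s = o + i*LLs lives in lane i
// of vector element o, so the scalar coefficients are scattered into the matching lanes here.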
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
assert(Nc == 3);
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5m(hp, psi[ss+vp]);
spProj5p(hm, psi[ss+vm]);
if (vp <= v){ rotate(hp, hp, 1); }
if (vm >= v){ rotate(hm, hm, nsimd-1); }
hp = 0.5*hp;
hm = 0.5*hm;
spRecon5m(fp, hp);
spRecon5p(fm, hm);
chi[ss+v] = d[v]*phi[ss+v];
chi[ss+v] = chi[ss+v] + u[v]*fp;
chi[ss+v] = chi[ss+v] + l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
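// When the s+1 (s-1) neighbour wraps outside this vector element it sits one SIMD lane over;
// rotating by one complex lane (two real elements) realigns it before use.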
if(vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs)
{
#if 0
this->M5D(psi, phi, chi, lower, diag, upper);
// FIXME: possible gain from vectorizing shift operation as well?
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
}
#else
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd>> u(LLs);
Vector<iSinglet<Simd>> l(LLs);
Vector<iSinglet<Simd>> d(LLs);
Vector<iSinglet<Simd>> s(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
scalar_type* s_p = (scalar_type*) &s[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
s_p[ss] = shift_coeffs[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
assert(Nc == 3);
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
int vs = (this->pm == 1) ? LLs-1 : 0;
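// hs_* load the spinor components of psi at vector element vs (LLs-1 for pm == +1, 0 for
// pm == -1); these feed the shift terms added alongside the tridiagonal terms below.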
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
if(vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(this->pm == 1 && vs <= v){
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
if(this->pm == -1 && vs >= v){
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
}
this->M5Dtime += usecond();
#endif
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd>> u(LLs);
Vector<iSinglet<Simd>> l(LLs);
Vector<iSinglet<Simd>> d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5p(hp, psi[ss+vp]);
spProj5m(hm, psi[ss+vm]);
if(vp <= v){ rotate(hp, hp, 1); }
if(vm >= v){ rotate(hm, hm, nsimd-1); }
hp = hp*0.5;
hm = hm*0.5;
spRecon5p(fp, hp);
spRecon5m(fm, hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if (vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs)
{
#if 0
this->M5Ddag(psi, phi, chi, lower, diag, upper);
// FIXME: possible gain from vectorizing shift operation as well?
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
}
#else
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd>> u(LLs);
Vector<iSinglet<Simd>> l(LLs);
Vector<iSinglet<Simd>> d(LLs);
Vector<iSinglet<Simd>> s(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
scalar_type* s_p = (scalar_type*) &s[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
s_p[ss] = shift_coeffs[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
int vs = (this->pm == 1) ? LLs-1 : 0;
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if (vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(this->pm == 1 && vs <= v){
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
if(this->pm == -1 && vs >= v){
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
}
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
}
this->M5Dtime += usecond();
#endif
}
#ifdef AVX512
#include<simd/Intel512common.h>
#include<simd/Intel512avx.h>
#include<simd/Intel512single.h>
#endif
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
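// Generic (non-AVX512) path: for each output slice s1, broadcast psi one SIMD lane at a time
// and accumulate Matp against the upper two spin components and Matm against the lower two.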
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
int s = s2 + l*LLs;
int lex = s2 + LLs*site;
if( s2==0 && l==0 ){
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
}}
}}
{
int lex = s1 + LLs*site;
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %%zmm1
#define Chi_01 %%zmm2
#define Chi_02 %%zmm3
#define Chi_10 %%zmm4
#define Chi_11 %%zmm5
#define Chi_12 %%zmm6
#define Chi_20 %%zmm7
#define Chi_21 %%zmm8
#define Chi_22 %%zmm9
#define Chi_30 %%zmm10
#define Chi_31 %%zmm11
#define Chi_32 %%zmm12
#define BCAST0 %%zmm13
#define BCAST1 %%zmm14
#define BCAST2 %%zmm15
#define BCAST3 %%zmm16
#define BCAST4 %%zmm17
#define BCAST5 %%zmm18
#define BCAST6 %%zmm19
#define BCAST7 %%zmm20
#define BCAST8 %%zmm21
#define BCAST9 %%zmm22
#define BCAST10 %%zmm23
#define BCAST11 %%zmm24
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
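// Stepping from SIMD lane l to lane l+1 moves from s = s2 + l*LLs to s + LLs, i.e. LLs*LLs
// entries further into Matp/Matm (indexed as Mat[LLs*s + s1]), hence this per-lane increment.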
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
int lex = s2 + LLs*site;
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t) &psi[lex];
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
if((s2+l)==0) {
asm(
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0)
VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2)
VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
VMULMEM(0,%1,BCAST8,Chi_22)
VMULMEM(0,%1,BCAST9,Chi_30)
VMULMEM(0,%1,BCAST10,Chi_31)
VMULMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
} else {
asm(
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
}
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(Simd::scalar_type);
}
}
{
int lexa = s1+LLs*site;
asm (
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
// Z-mobius version
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
exit(-1);
};
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard = psi.checkerboard;
Vector<iSinglet<Simd>> Matp;
Vector<iSinglet<Simd>> Matm;
Vector<iSinglet<Simd>>* _Matp;
Vector<iSinglet<Simd>>* _Matm;
// MooeeInternalCompute(dag,inv,Matp,Matm);
if(inv && dag){
_Matp = &this->MatpInvDag;
_Matm = &this->MatmInvDag;
}
if(inv && (!dag)){
_Matp = &this->MatpInv;
_Matm = &this->MatmInv;
}
if(!inv){
MooeeInternalCompute(dag, inv, Matp, Matm);
_Matp = &Matp;
_Matm = &Matm;
}
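// Dispatch summary: the two inverse cases reuse the coefficient tables
// MatpInv/MatmInv (and their dagger variants) already cached in the class,
// while the forward (inv==0) application rebuilds Matp/Matm on the fly through
// MooeeInternalCompute before the per-site kernels below are invoked.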
assert(_Matp->size() == Ls*LLs);
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
if(switcheroo<Coeff_t>::iscomplex()){
parallel_for(auto site=0; site<vol; site++){
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
} else {
parallel_for(auto site=0; site<vol; site++){
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
}
this->MooeeInvTime += usecond();
}
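// A minimal sketch of how MooeeInternal is expected to be driven (assuming the
// usual Grid wrapper pattern; the wrapper itself is not part of this file):
//
//   template<class Impl>
//   void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
//   {
//     this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
//   }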
#ifdef MOBIUS_EOFA_DPERP_VEC
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}

View File

@ -1,3 +1,4 @@
#if 1
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -97,6 +98,117 @@ namespace Grid {
}
};
#if 0
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Copied from DiagTwoSolve
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagTwoKappa solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
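// Minimal usage sketch for the solver above (operator, source and solution
// names are assumed; not part of this header):
//
//   ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
//   SchurRedBlackDiagTwoSolve<LatticeFermion> SchurSolver(CG);
//   SchurSolver(Ddwf, src, sol); // Ddwf: any checkerboarded fermion operator, e.g. DomainWallFermionR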
#endif
namespace QCD{
//
// Determinant is det of middle factor
// This assumes Mee is independent of U.
//
//
template<class Impl>
class SchurDifferentiableDiagTwo: public SchurDiagTwoOperator<FermionOperator<Impl>,typename Impl::FermionField>
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Matrix;
SchurDifferentiableDiagTwo (Matrix &Mat) : SchurDiagTwoOperator<Matrix,FermionField>(Mat) {};
};
#if 0
template<class Impl>
class SchurDifferentiableDiagTwoKappa : public SchurDiagTwoKappaOperator<FermionOperator<Impl>,typename Impl::FermionField>
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Matrix;
SchurDifferentiableDiagTwoKappa (Matrix &Mat) : SchurDiagTwoKappaOperator<Matrix,FermionField>(Mat) {};
};
#endif
}
}
#endif
#endif

View File

@ -30,60 +30,181 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define REGISTER
#define LOAD_CHIMU \
{const SiteSpinor & ref (in._odata[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
Chimu_02=ref()(0)(2);\
Chimu_10=ref()(1)(0);\
Chimu_11=ref()(1)(1);\
Chimu_12=ref()(1)(2);\
Chimu_20=ref()(2)(0);\
Chimu_21=ref()(2)(1);\
Chimu_22=ref()(2)(2);\
Chimu_30=ref()(3)(0);\
Chimu_31=ref()(3)(1);\
Chimu_32=ref()(3)(2);}
#define LOAD_CHIMU_BODY(F) \
Chimu_00=ref(F)(0)(0); \
Chimu_01=ref(F)(0)(1); \
Chimu_02=ref(F)(0)(2); \
Chimu_10=ref(F)(1)(0); \
Chimu_11=ref(F)(1)(1); \
Chimu_12=ref(F)(1)(2); \
Chimu_20=ref(F)(2)(0); \
Chimu_21=ref(F)(2)(1); \
Chimu_22=ref(F)(2)(2); \
Chimu_30=ref(F)(3)(0); \
Chimu_31=ref(F)(3)(1); \
Chimu_32=ref(F)(3)(2)
#define LOAD_CHI\
{const SiteHalfSpinor &ref(buf[offset]); \
Chi_00 = ref()(0)(0);\
Chi_01 = ref()(0)(1);\
Chi_02 = ref()(0)(2);\
Chi_10 = ref()(1)(0);\
Chi_11 = ref()(1)(1);\
Chi_12 = ref()(1)(2);}
#define LOAD_CHIMU(DIR,F,PERM) \
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
#define LOAD_CHI_BODY(F) \
Chi_00 = ref(F)(0)(0);\
Chi_01 = ref(F)(0)(1);\
Chi_02 = ref(F)(0)(2);\
Chi_10 = ref(F)(1)(0);\
Chi_11 = ref(F)(1)(1);\
Chi_12 = ref(F)(1)(2)
#define LOAD_CHI(DIR,F,PERM) \
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
//G-parity implementations using in-place intrinsic ops
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
//0h,1l -> 1l,0h
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
//Pulled fermion through forwards face, GPBC on upper component
//Need 0= 0l 1h 1= 1l 0h
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
//Pulled fermion through backwards face, GPBC on lower component
//Need 0= 1l 0h 1= 0l 1h
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(1)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
INTO = tmp2;
//0l 0h -> 0h 0l
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(0)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
INTO = tmp2;
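// Worked two-lane example of the twist above (lane semantics taken from the
// diagrams in the comments; vComplexD and the variable names are illustrative):
//
//   vComplexD f0, f1, t1, t2, t3, chi0;  // f0 = {0l,0h}, f1 = {1l,1h}
//   permute0(t1, f1);                    // t1 = {1h,1l}
//   exchange0(t2, t3, f0, t1);           // t2 = {0l,1h}, t3 = {0h,1l}
//   chi0 = t2;                           // flavour 0 now holds 0l in the lower lane, 1h in the upper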
#define LOAD_CHI_SETUP(DIR,F) \
g = F; \
direction = st._directions[DIR]; \
distance = st._distances[DIR]; \
sl = st._grid->_simd_layout[direction]; \
inplace_twist = 0; \
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
if(sl == 1){ \
g = (F+1) % 2; \
}else{ \
inplace_twist = 1; \
} \
}
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteSpinor &ref(in._odata[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHIMU_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteHalfSpinor &ref(buf[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHI_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
// To splat or not to splat depends on the implementation
#define MULT_2SPIN(A)\
{auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00;\
UChi_10 = U_00*Chi_10;\
UChi_01 = U_10*Chi_00;\
UChi_11 = U_10*Chi_10;\
UChi_02 = U_20*Chi_00;\
UChi_12 = U_20*Chi_10;\
UChi_00+= U_01*Chi_01;\
UChi_10+= U_01*Chi_11;\
UChi_01+= U_11*Chi_01;\
UChi_11+= U_11*Chi_11;\
UChi_02+= U_21*Chi_01;\
UChi_12+= U_21*Chi_11;\
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02;\
UChi_10+= U_00*Chi_12;\
UChi_01+= U_10*Chi_02;\
UChi_11+= U_10*Chi_12;\
UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12;}
#define MULT_2SPIN_BODY \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00; \
UChi_10 = U_00*Chi_10; \
UChi_01 = U_10*Chi_00; \
UChi_11 = U_10*Chi_10; \
UChi_02 = U_20*Chi_00; \
UChi_12 = U_20*Chi_10; \
UChi_00+= U_01*Chi_01; \
UChi_10+= U_01*Chi_11; \
UChi_01+= U_11*Chi_01; \
UChi_11+= U_11*Chi_11; \
UChi_02+= U_21*Chi_01; \
UChi_12+= U_21*Chi_11; \
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02; \
UChi_10+= U_00*Chi_12; \
UChi_01+= U_10*Chi_02; \
UChi_11+= U_10*Chi_12; \
UChi_02+= U_20*Chi_02; \
UChi_12+= U_20*Chi_12
#define MULT_2SPIN(A,F) \
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
#define MULT_2SPIN_GPARITY(A,F) \
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
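// The shared body above is the colour matrix-vector product applied to both
// spin components of the half spinor, UChi(s)(c) = sum_{c'} U(c,c')*Chi(s)(c'):
// two columns of U are loaded and multiplied, then the third column is loaded
// and accumulated. The G-parity variant differs only in addressing the flavour
// block (F) of the doubled gauge field.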
#define PERMUTE_DIR(dir) \
@ -307,84 +428,87 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
result_31-= UChi_11; \
result_32-= UChi_12;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU; \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
LOAD_CHI_IMPL(DIR,F,PERM); \
} \
MULT_2SPIN(DIR); \
MULT_2SPIN_IMPL(DIR,F); \
RECON;
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU; \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI; \
LOAD_CHI_IMPL(DIR,F,PERM); \
} \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN(DIR); \
MULT_2SPIN_IMPL(DIR,F); \
RECON; \
}
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI; \
MULT_2SPIN(DIR); \
LOAD_CHI_IMPL(DIR,F,PERM); \
MULT_2SPIN_IMPL(DIR,F); \
RECON; \
nmu++; \
}
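// Summary of the three leg variants: HAND_STENCIL_LEG always gathers and
// multiplies; HAND_STENCIL_LEG_INT only processes legs whose neighbour is local
// or resident on the same node (the interior pass); HAND_STENCIL_LEG_EXT
// handles the remaining off-node legs and increments nmu so that
// HAND_RESULT_EXT knows whether there is anything to accumulate.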
#define HAND_RESULT(ss) \
#define HAND_RESULT(ss,F) \
{ \
SiteSpinor & ref (out._odata[ss]); \
vstream(ref()(0)(0),result_00); \
vstream(ref()(0)(1),result_01); \
vstream(ref()(0)(2),result_02); \
vstream(ref()(1)(0),result_10); \
vstream(ref()(1)(1),result_11); \
vstream(ref()(1)(2),result_12); \
vstream(ref()(2)(0),result_20); \
vstream(ref()(2)(1),result_21); \
vstream(ref()(2)(2),result_22); \
vstream(ref()(3)(0),result_30); \
vstream(ref()(3)(1),result_31); \
vstream(ref()(3)(2),result_32); \
vstream(ref(F)(0)(0),result_00); \
vstream(ref(F)(0)(1),result_01); \
vstream(ref(F)(0)(2),result_02); \
vstream(ref(F)(1)(0),result_10); \
vstream(ref(F)(1)(1),result_11); \
vstream(ref(F)(1)(2),result_12); \
vstream(ref(F)(2)(0),result_20); \
vstream(ref(F)(2)(1),result_21); \
vstream(ref(F)(2)(2),result_22); \
vstream(ref(F)(3)(0),result_30); \
vstream(ref(F)(3)(1),result_31); \
vstream(ref(F)(3)(2),result_32); \
}
#define HAND_RESULT_EXT(ss) \
#define HAND_RESULT_EXT(ss,F) \
if (nmu){ \
SiteSpinor & ref (out._odata[ss]); \
ref()(0)(0)+=result_00; \
ref()(0)(1)+=result_01; \
ref()(0)(2)+=result_02; \
ref()(1)(0)+=result_10; \
ref()(1)(1)+=result_11; \
ref()(1)(2)+=result_12; \
ref()(2)(0)+=result_20; \
ref()(2)(1)+=result_21; \
ref()(2)(2)+=result_22; \
ref()(3)(0)+=result_30; \
ref()(3)(1)+=result_31; \
ref()(3)(2)+=result_32; \
ref(F)(0)(0)+=result_00; \
ref(F)(0)(1)+=result_01; \
ref(F)(0)(2)+=result_02; \
ref(F)(1)(0)+=result_10; \
ref(F)(1)(1)+=result_11; \
ref(F)(1)(2)+=result_12; \
ref(F)(2)(0)+=result_20; \
ref(F)(2)(1)+=result_21; \
ref(F)(2)(2)+=result_22; \
ref(F)(3)(0)+=result_30; \
ref(F)(3)(1)+=result_31; \
ref(F)(3)(2)+=result_32; \
}
@ -463,15 +587,18 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
int offset,local,perm, ptype;
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
@ -485,16 +612,19 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
StencilEntry *SE;
int offset,local,perm, ptype;
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl> void
@ -509,16 +639,20 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
int offset,local,perm, ptype;
StencilEntry *SE;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
@ -532,16 +666,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
StencilEntry *SE;
int offset,local,perm, ptype;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl> void
@ -557,16 +695,20 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
int offset,local,perm, ptype;
StencilEntry *SE;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT_EXT(ss);
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
@ -581,16 +723,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
StencilEntry *SE;
int offset,local,perm, ptype;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT_EXT(ss);
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
////////////////////////////////////////////////
@ -646,11 +792,124 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
const FermionField &in, \
FermionField &out){ assert(0); } \
HAND_SPECIALISE_EMPTY(GparityWilsonImplF);
HAND_SPECIALISE_EMPTY(GparityWilsonImplD);
HAND_SPECIALISE_EMPTY(GparityWilsonImplFH);
HAND_SPECIALISE_EMPTY(GparityWilsonImplDF);
#define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
int nmu=0; \
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
int nmu=0; \
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
}
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
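// Note on the specialisations above: each G-parity kernel runs the
// hand-unrolled site body twice, once per flavour (F=0, then F=1), with the
// G-parity load/multiply macros selecting the flavour block and applying the
// in-place boundary twist where the SIMD layout requires it.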
////////////// Wilson ; uses this implementation /////////////////////
#define INSTANTIATE_THEM(A) \

View File

@ -140,6 +140,7 @@ namespace Grid{
};
}
}
#endif

View File

@ -0,0 +1,264 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h
Copyright (C) 2017
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/////////////////////////////////////////////////////////////////
// Implementation of exact one flavour algorithm (EOFA) //
// using fermion classes defined in: //
// Grid/qcd/action/fermion/DomainWallEOFAFermion.h (Shamir) //
// Grid/qcd/action/fermion/MobiusEOFAFermion.h (Mobius) //
// arXiv: 1403.1683, 1706.05843 //
/////////////////////////////////////////////////////////////////
#ifndef QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
#define QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
namespace Grid{
namespace QCD{
///////////////////////////////////////////////////////////////
// Exact one flavour implementation of DWF determinant ratio //
///////////////////////////////////////////////////////////////
template<class Impl>
class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef OneFlavourRationalParams Params;
Params param;
MultiShiftFunction PowerNegHalf;
private:
bool use_heatbath_forecasting;
AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
SchurRedBlackDiagMooeeSolve<FermionField> Solver;
FermionField Phi; // the pseudofermion field for this trajectory
public:
ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, AbstractEOFAFermion<Impl>& _Rop,
OperatorFunction<FermionField>& S, Params& p, bool use_fc=false) : Lop(_Lop), Rop(_Rop), Solver(S),
Phi(_Lop.FermionGrid()), param(p), use_heatbath_forecasting(use_fc)
{
AlgRemez remez(param.lo, param.hi, param.precision);
// MdagM^(+- 1/2)
std::cout << GridLogMessage << "Generating degree " << param.degree << " for x^(-1/2)" << std::endl;
remez.generateApprox(param.degree, 1, 2);
PowerNegHalf.Init(remez, param.tolerance, true);
};
virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
virtual std::string LogParameters() {
std::stringstream sstream;
sstream << GridLogMessage << "[" << action_name() << "] Low :" << param.lo << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] High :" << param.hi << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] Max iterations :" << param.MaxIter << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] Tolerance :" << param.tolerance << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] Degree :" << param.degree << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] Precision :" << param.precision << std::endl;
return sstream.str();
}
// Spin projection
void spProj(const FermionField& in, FermionField& out, int sign, int Ls)
{
if(sign == 1){ for(int s=0; s<Ls; ++s){ axpby_ssp_pplus(out, 0.0, in, 1.0, in, s, s); } }
else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
}
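// spProj above applies the 4d chiral projector slice by slice in the fifth
// dimension, out(s) = P_{\pm} in(s) with P_{\pm} = (1 \pm \gamma_{5})/2,
// realised through axpby_ssp_pplus / axpby_ssp_pminus with coefficients (0,1).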
// EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
// We generate a Gaussian noise vector \eta, and then compute
// \Phi = M_{\rm EOFA}^{-1/2} * \eta
// using a rational approximation to the inverse square root
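// Concretely, the partial fraction form used below is
//   x^{-1/2} \approx \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{k} / ( x + \beta_{k} ),
// with \alpha_{0} = PowerNegHalf.norm, \alpha_{k} = PowerNegHalf.residues[k]
// and \beta_{k} = PowerNegHalf.poles[k]; the shifted solves in the loops below
// evaluate the pole terms at x = M_{\rm EOFA}.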
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
{
Lop.ImportGauge(U);
Rop.ImportGauge(U);
FermionField eta (Lop.FermionGrid());
FermionField CG_src (Lop.FermionGrid());
FermionField CG_soln (Lop.FermionGrid());
FermionField Forecast_src(Lop.FermionGrid());
std::vector<FermionField> tmp(2, Lop.FermionGrid());
// Use chronological inverter to forecast solutions across poles
std::vector<FermionField> prev_solns;
if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
// Seed with Gaussian noise vector (var = 0.5)
RealD scale = std::sqrt(0.5);
gaussian(pRNG,eta);
eta = eta * scale;
printf("Heatbath source vector: <\\eta|\\eta> = %1.15e\n", norm2(eta));
// \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
RealD N(PowerNegHalf.norm);
for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
Phi = eta * N;
// LH terms:
// \Phi = \Phi + k \sum_{k=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
// - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
RealD gamma_l(0.0);
spProj(eta, tmp[0], -1, Lop.Ls);
Lop.Omega(tmp[0], tmp[1], -1, 0);
G5R5(CG_src, tmp[1]);
tmp[1] = zero;
for(int k=0; k<param.degree; ++k){
gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
Lop.RefreshShiftCoefficients(-gamma_l);
if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
Lop.Mdag(CG_src, Forecast_src);
CG_soln = Forecast(Lop, Forecast_src, prev_solns);
Solver(Lop, CG_src, CG_soln);
prev_solns.push_back(CG_soln);
} else {
CG_soln = zero; // Just use zero as the initial guess
Solver(Lop, CG_src, CG_soln);
}
Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
}
Lop.Omega(tmp[1], tmp[0], -1, 1);
spProj(tmp[0], tmp[1], -1, Lop.Ls);
Phi = Phi + tmp[1];
// RH terms:
// \Phi = \Phi - k \sum_{k=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
// + \gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
spProj(eta, tmp[0], 1, Rop.Ls);
Rop.Omega(tmp[0], tmp[1], 1, 0);
G5R5(CG_src, tmp[1]);
tmp[1] = zero;
if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
for(int k=0; k<param.degree; ++k){
gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
if(use_heatbath_forecasting){
Rop.Mdag(CG_src, Forecast_src);
CG_soln = Forecast(Rop, Forecast_src, prev_solns);
Solver(Rop, CG_src, CG_soln);
prev_solns.push_back(CG_soln);
} else {
CG_soln = zero;
Solver(Rop, CG_src, CG_soln);
}
Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
}
Rop.Omega(tmp[1], tmp[0], 1, 1);
spProj(tmp[0], tmp[1], 1, Rop.Ls);
Phi = Phi + tmp[1];
// Reset shift coefficients for energy and force evals
Lop.RefreshShiftCoefficients(0.0);
Rop.RefreshShiftCoefficients(-1.0);
};
// EOFA action: see Eqn. (10) of arXiv:1706.05843
virtual RealD S(const GaugeField& U)
{
Lop.ImportGauge(U);
Rop.ImportGauge(U);
FermionField spProj_Phi(Lop.FermionGrid());
std::vector<FermionField> tmp(2, Lop.FermionGrid());
// S = <\Phi|\Phi>
RealD action(norm2(Phi));
// LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
spProj(Phi, spProj_Phi, -1, Lop.Ls);
Lop.Omega(spProj_Phi, tmp[0], -1, 0);
G5R5(tmp[1], tmp[0]);
tmp[0] = zero;
Solver(Lop, tmp[1], tmp[0]);
Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
Lop.Omega(tmp[1], tmp[0], -1, 1);
action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
// RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
// - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
spProj(Phi, spProj_Phi, 1, Rop.Ls);
Rop.Omega(spProj_Phi, tmp[0], 1, 0);
G5R5(tmp[1], tmp[0]);
tmp[0] = zero;
Solver(Rop, tmp[1], tmp[0]);
Rop.Dtilde(tmp[0], tmp[1]);
Rop.Omega(tmp[1], tmp[0], 1, 1);
action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
return action;
};
// EOFA pseudofermion force: see Eqns. (34)-(36) of arXiv:1706.05843
virtual void deriv(const GaugeField& U, GaugeField& dSdU)
{
Lop.ImportGauge(U);
Rop.ImportGauge(U);
FermionField spProj_Phi (Lop.FermionGrid());
FermionField Omega_spProj_Phi(Lop.FermionGrid());
FermionField CG_src (Lop.FermionGrid());
FermionField Chi (Lop.FermionGrid());
FermionField g5_R5_Chi (Lop.FermionGrid());
GaugeField force(Lop.GaugeGrid());
// LH: dSdU = k \chi_{L}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{L}
// \chi_{L} = H(mf)^{-1} \Omega_{-} P_{-} \Phi
spProj(Phi, spProj_Phi, -1, Lop.Ls);
Lop.Omega(spProj_Phi, Omega_spProj_Phi, -1, 0);
G5R5(CG_src, Omega_spProj_Phi);
spProj_Phi = zero;
Solver(Lop, CG_src, spProj_Phi);
Lop.Dtilde(spProj_Phi, Chi);
G5R5(g5_R5_Chi, Chi);
Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
dSdU = Lop.k * force;
// RH: dSdU = dSdU - k \chi_{R}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{R}
// \chi_{R} = ( H(mb) - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \Phi
spProj(Phi, spProj_Phi, 1, Rop.Ls);
Rop.Omega(spProj_Phi, Omega_spProj_Phi, 1, 0);
G5R5(CG_src, Omega_spProj_Phi);
spProj_Phi = zero;
Solver(Rop, CG_src, spProj_Phi);
Rop.Dtilde(spProj_Phi, Chi);
G5R5(g5_R5_Chi, Chi);
Rop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
dSdU = dSdU - Rop.k * force;
};
};
}}
#endif

View File

@ -38,5 +38,6 @@ directory
#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
#include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
#endif

View File

@ -231,7 +231,7 @@ class ForceGradient : public Integrator<FieldImplementation, SmearingPolicy,
Field Pfg(U._grid);
Ufg = U;
Pfg = zero;
std::cout << GridLogMessage << "FG update " << fg_dt << " " << ep
std::cout << GridLogIntegrator << "FG update " << fg_dt << " " << ep
<< std::endl;
// prepare_fg; no prediction/result cache for now
// could relax CG stopping conditions for the

View File

@ -72,7 +72,7 @@ protected:
}
virtual unsigned int Ls(){
return 0;
}
virtual void print_parameters(){
@ -97,7 +97,7 @@ class HMC_FermionOperatorModuleFactory
: public Factory < FermionOperatorModuleBase<QCD::FermionOperator<FermionImpl> > , Reader<ReaderClass> > {
public:
// use SINGLETON FUNCTOR MACRO HERE
typedef Reader<ReaderClass> TheReader;
HMC_FermionOperatorModuleFactory(const HMC_FermionOperatorModuleFactory& e) = delete;
void operator=(const HMC_FermionOperatorModuleFactory& e) = delete;
@ -122,7 +122,7 @@ namespace QCD{
// Modules
class WilsonFermionParameters : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonFermionParameters,
RealD, mass);
};
@ -144,7 +144,7 @@ class WilsonFermionModule: public FermionOperatorModule<WilsonFermion, FermionIm
class MobiusFermionParameters : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MobiusFermionParameters,
RealD, mass,
RealD, M5,
RealD, b,
@ -166,7 +166,7 @@ class MobiusFermionModule: public FermionOperatorModule<MobiusFermion, FermionIm
auto GridMod = this->GridRefs[0];
auto GridMod5d = this->GridRefs[1];
typename FermionImpl::GaugeField U(GridMod->get_full());
this->FOPtr.reset(new MobiusFermion<FermionImpl>( U, *(GridMod->get_full()), *(GridMod->get_rb()),
*(GridMod5d->get_full()), *(GridMod5d->get_rb()),
this->Par_.mass, this->Par_.M5, this->Par_.b, this->Par_.c));
}
@ -175,7 +175,7 @@ class MobiusFermionModule: public FermionOperatorModule<MobiusFermion, FermionIm
class DomainWallFermionParameters : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(DomainWallFermionParameters,
RealD, mass,
RealD, M5,
unsigned int, Ls);
@ -195,16 +195,49 @@ class DomainWallFermionModule: public FermionOperatorModule<DomainWallFermion, F
auto GridMod = this->GridRefs[0];
auto GridMod5d = this->GridRefs[1];
typename FermionImpl::GaugeField U(GridMod->get_full());
this->FOPtr.reset(new DomainWallFermion<FermionImpl>( U, *(GridMod->get_full()), *(GridMod->get_rb()),
*(GridMod5d->get_full()), *(GridMod5d->get_rb()),
this->Par_.mass, this->Par_.M5));
}
};
class DomainWallEOFAFermionParameters : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(DomainWallEOFAFermionParameters,
RealD, mq1,
RealD, mq2,
RealD, mq3,
RealD, shift,
int, pm,
RealD, M5,
unsigned int, Ls);
};
template <class FermionImpl >
class DomainWallEOFAFermionModule: public FermionOperatorModule<DomainWallEOFAFermion, FermionImpl, DomainWallEOFAFermionParameters> {
typedef FermionOperatorModule<DomainWallEOFAFermion, FermionImpl, DomainWallEOFAFermionParameters> FermBase;
using FermBase::FermBase; // for constructors
virtual unsigned int Ls(){
return this->Par_.Ls;
}
// acquire resource
virtual void initialize(){
auto GridMod = this->GridRefs[0];
auto GridMod5d = this->GridRefs[1];
typename FermionImpl::GaugeField U(GridMod->get_full());
this->FOPtr.reset(new DomainWallEOFAFermion<FermionImpl>( U, *(GridMod->get_full()), *(GridMod->get_rb()),
*(GridMod5d->get_full()), *(GridMod5d->get_rb()),
this->Par_.mq1, this->Par_.mq2, this->Par_.mq3,
this->Par_.shift, this->Par_.pm, this->Par_.M5));
}
};
} // QCD
} // Grid
#endif //FERMIONOPERATOR_MODULES_H

View File

@ -50,6 +50,7 @@ GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,c
GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
{
int N4=FourDimGrid->_ndimension;
assert(N4==4);
std::vector<int> latt5(1,Ls);
std::vector<int> simd5(1,1);
@ -60,7 +61,7 @@ GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
simd5.push_back(FourDimGrid->_simd_layout[d]);
mpi5.push_back(FourDimGrid->_processors[d]);
}
return new GridCartesian(latt5,simd5,mpi5);
return new GridCartesian(latt5,simd5,mpi5,*FourDimGrid);
}
@ -68,18 +69,14 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
{
int N4=FourDimGrid->_ndimension;
int cbd=1;
std::vector<int> latt5(1,Ls);
std::vector<int> simd5(1,1);
std::vector<int> mpi5(1,1);
std::vector<int> cb5(1,0);
for(int d=0;d<N4;d++){
latt5.push_back(FourDimGrid->_fdimensions[d]);
simd5.push_back(FourDimGrid->_simd_layout[d]);
mpi5.push_back(FourDimGrid->_processors[d]);
cb5.push_back( 1);
}
return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
}
GridCartesian *tmp = makeFiveDimGrid(Ls,FourDimGrid);
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,cb5,cbd);
delete tmp;
return ret;
}
@ -97,26 +94,24 @@ GridCartesian *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartes
simd5.push_back(1);
mpi5.push_back(FourDimGrid->_processors[d]);
}
return new GridCartesian(latt5,simd5,mpi5);
return new GridCartesian(latt5,simd5,mpi5,*FourDimGrid);
}
///////////////////////////////////////////////////
// Interface is inefficient and forces the deletion
// Pass in the non-redblack grid
///////////////////////////////////////////////////
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
{
int N4=FourDimGrid->_ndimension;
int nsimd = FourDimGrid->Nsimd();
int cbd=1;
std::vector<int> latt5(1,Ls);
std::vector<int> simd5(1,nsimd);
std::vector<int> mpi5(1,1);
std::vector<int> cb5(1,0);
for(int d=0;d<N4;d++){
latt5.push_back(FourDimGrid->_fdimensions[d]);
simd5.push_back(1);
mpi5.push_back(FourDimGrid->_processors[d]);
cb5.push_back(1);
}
return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
}
GridCartesian *tmp = makeFiveDimDWFGrid(Ls,FourDimGrid);
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,cb5,cbd);
delete tmp;
return ret;
}
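// Minimal usage sketch (lattice/decomposition helpers are the standard Grid
// defaults; not part of this file):
//
//   GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
//     GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
//   GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
//   GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);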

View File

@ -86,7 +86,7 @@ namespace Grid {
or element<T>::is_number;
};
// Vector flatening utility class ////////////////////////////////////////////
// Vector flattening utility class ////////////////////////////////////////////
// Class to flatten a multidimensional std::vector
template <typename V>
class Flatten

View File

@ -42,6 +42,7 @@ JSONWriter::~JSONWriter(void)
// write prettified JSON to file
std::ofstream os(fileName_);
//std::cout << "JSONWriter::~JSONWriter" << std::endl;
os << std::setw(2) << json::parse(ss_.str()) << std::endl;
}
@ -56,6 +57,7 @@ void JSONWriter::push(const string &s)
void JSONWriter::pop(void)
{
//std::cout << "JSONWriter::pop" << std::endl;
delete_comma();
ss_ << "},";
}
@ -67,20 +69,22 @@ void JSONWriter::delete_comma()
ss_.str(dlast);
}
// here we are hitting a g++ bug (Bug 56480)
// compiles fine with clang
// have to wrap in the Grid namespace
// annoying, but necessary for TravisCI
namespace Grid
{
template<>
void JSONWriter::writeDefault(const std::string &s,
const std::string &x)
void JSONWriter::writeDefault(const std::string &s, const std::string &x)
{
//std::cout << "JSONWriter::writeDefault(string) : " << s << std::endl;
std::ostringstream os;
os << std::boolalpha << x;
if (s.size())
ss_ << "\""<< s << "\" : \"" << x << "\" ," ;
ss_ << "\""<< s << "\" : \"" << os.str() << "\" ," ;
else
ss_ << "\"" << x << "\" ," ;
ss_ << os.str() << " ," ;
}
}// namespace Grid
@ -138,6 +142,7 @@ void JSONReader::pop(void)
bool JSONReader::nextElement(const std::string &s)
{
// Work in progress
// JSON dictionaries do not support multiple names
// Same name objects must be packed in vectors
++it_;

View File

@ -58,10 +58,15 @@ namespace Grid
void writeDefault(const std::string &s, const std::complex<U> &x);
template <typename U>
void writeDefault(const std::string &s, const std::vector<U> &x);
template <typename U, typename P>
void writeDefault(const std::string &s, const std::pair<U,P> &x);
template<std::size_t N>
void writeDefault(const std::string &s, const char(&x)[N]);
void writeDefault(const std::string &s, const std::string &x);
private:
void delete_comma();
std::string fileName_;
@ -82,6 +87,8 @@ namespace Grid
void readDefault(const std::string &s, std::complex<U> &output);
template <typename U>
void readDefault(const std::string &s, std::vector<U> &output);
template <typename U, typename P>
void readDefault(const std::string &s, std::pair<U,P> &output);
private:
json jobject_; // main object
json jcur_; // current json object
@ -106,7 +113,7 @@ namespace Grid
template <typename U>
void JSONWriter::writeDefault(const std::string &s, const U &x)
{
//std::cout << "JSONReader::writeDefault(U) : " << s << std::endl;
//std::cout << "JSONWriter::writeDefault(U) : " << s << " " << x <<std::endl;
std::ostringstream os;
os << std::boolalpha << x;
if (s.size())
@ -118,7 +125,7 @@ namespace Grid
template <typename U>
void JSONWriter::writeDefault(const std::string &s, const std::complex<U> &x)
{
//std::cout << "JSONReader::writeDefault(complex) : " << s << std::endl;
//std::cout << "JSONWriter::writeDefault(complex) : " << s << " " << x << std::endl;
std::ostringstream os;
os << "["<< std::boolalpha << x.real() << ", " << x.imag() << "]";
if (s.size())
@ -127,10 +134,22 @@ namespace Grid
ss_ << os.str() << " ," ;
}
template <typename U, typename P>
void JSONWriter::writeDefault(const std::string &s, const std::pair<U,P> &x)
{
//std::cout << "JSONWriter::writeDefault(pair) : " << s << " " << x << std::endl;
std::ostringstream os;
os << "["<< std::boolalpha << "\""<< x.first << "\" , \"" << x.second << "\" ]";
if (s.size())
ss_ << "\""<< s << "\" : " << os.str() << " ," ;
else
ss_ << os.str() << " ," ;
}
template <typename U>
void JSONWriter::writeDefault(const std::string &s, const std::vector<U> &x)
{
//std::cout << "JSONReader::writeDefault(vec U) : " << s << std::endl;
//std::cout << "JSONWriter::writeDefault(vec U) : " << s << std::endl;
if (s.size())
ss_ << " \""<<s<<"\" : [";
@ -146,12 +165,12 @@ namespace Grid
template<std::size_t N>
void JSONWriter::writeDefault(const std::string &s, const char(&x)[N]){
//std::cout << "JSONReader::writeDefault(char U) : " << s << std::endl;
//std::cout << "JSONWriter::writeDefault(char U) : " << s << " " << x << std::endl;
if (s.size())
ss_ << "\""<< s << "\" : \"" << x << "\" ," ;
ss_ << "\""<< s << "\" : \"" << x << "\" ," ;
else
ss_ << "\"" << x << "\" ," ;
ss_ << "\"" << x << "\" ," ;
}
// Reader template implementation ////////////////////////////////////////////
@ -173,11 +192,35 @@ namespace Grid
}
// Reader template implementation ////////////////////////////////////////////
template <typename U, typename P>
void JSONReader::readDefault(const std::string &s, std::pair<U,P> &output)
{
U first;
P second;
json j;
if (s.size()){
//std::cout << "JSONReader::readDefault(pair) : " << s << " | "<< jcur_[s] << std::endl;
j = jcur_[s];
} else {
j = jcur_;
}
json::iterator it = j.begin();
jcur_ = *it;
read("", first);
it++;
jcur_ = *it;
read("", second);
output = std::pair<U,P>(first,second);
}
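// Round-trip sketch for the new std::pair support (the file name and field tag
// are assumed; not part of this header):
//
//   std::pair<int, std::string> pout{2, "abc"}, pin;
//   { JSONWriter wr("pair.json"); write(wr, "p", pout); }
//   { JSONReader rd("pair.json"); read (rd, "p", pin);  }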
template <typename U>
void JSONReader::readDefault(const std::string &s, std::complex<U> &output)
{
U tmp1, tmp2;
//std::cout << "JSONReader::readDefault( complex U) : " << s << " : "<< jcur_ << std::endl;
//std::cout << "JSONReader::readDefault(complex U) : " << s << " : "<< jcur_ << std::endl;
json j = jcur_;
json::iterator it = j.begin();
jcur_ = *it;

View File

@ -82,11 +82,11 @@ namespace Optimization {
double tmp[2]={a,b};
return vld1q_f64(tmp);
}
//Real double // N:tbc
//Real double
inline float64x2_t operator()(double a){
return vdupq_n_f64(a);
}
//Integer // N:tbc
//Integer
inline uint32x4_t operator()(Integer a){
return vdupq_n_u32(a);
}
@ -124,33 +124,32 @@ namespace Optimization {
// Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{
// Complex float // N:ok
// Complex float
inline float32x4_t operator()(Grid::ComplexF *a){
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
return vld1q_f32(tmp);
}
// Complex double // N:ok
// Complex double
inline float64x2_t operator()(Grid::ComplexD *a){
double tmp[2]={a[0].imag(),a[0].real()};
return vld1q_f64(tmp);
}
// Real float // N:ok
// Real float
inline float32x4_t operator()(float *a){
float tmp[4]={a[3],a[2],a[1],a[0]};
return vld1q_f32(tmp);
}
// Real double // N:ok
// Real double
inline float64x2_t operator()(double *a){
double tmp[2]={a[1],a[0]};
return vld1q_f64(tmp);
}
// Integer // N:ok
// Integer
inline uint32x4_t operator()(Integer *a){
return vld1q_dup_u32(a);
}
};
// N:leaving as is
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
@ -249,9 +248,9 @@ namespace Optimization {
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
// no fma, use mul and add
//float32x4_t r5;
//r5 = vmulq_f32(r0, a);
//return vaddq_f32(r4, r5);
// float32x4_t r5;
// r5 = vmulq_f32(r0, a);
// return vaddq_f32(r4, r5);
}
// Complex double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
@ -272,9 +271,9 @@ namespace Optimization {
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
// no fma, use mul and add
//float64x2_t r5;
//r5 = vmulq_f64(r0, a);
//return vaddq_f64(r4, r5);
// float64x2_t r5;
// r5 = vmulq_f64(r0, a);
// return vaddq_f64(r4, r5);
}
};
@ -421,11 +420,6 @@ namespace Optimization {
}
}
// working, but no restriction on n
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
// restriction on n
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
@ -441,7 +435,7 @@ namespace Optimization {
sb = vcvt_high_f32_f16(h);
// there is no direct conversion from lower float32x4_t to float64x2_t
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// workaround for clang
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
@ -547,7 +541,7 @@ namespace Optimization {
//Complex double Reduce
template<> // N:by Boyle
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
@ -562,9 +556,7 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
return vaddvq_u32(in);
}
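// vaddvq_u32 is the AArch64 "add across vector" intrinsic: it returns the sum
// of the four unsigned 32-bit lanes, i.e. exactly the horizontal reduction the
// previously missing integer implementation needed.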
}
@ -603,4 +595,5 @@ namespace Optimization {
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}
}
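The Integer specialisation of Reduce above now uses the AArch64 horizontal-add intrinsic instead of asserting, and tRotate<n> takes its shift modulo the lane count. A small standalone sketch of both intrinsics, assuming an AArch64 toolchain with arm_neon.h (not Grid code):

#include <arm_neon.h>
#include <cstdio>

int main(void) {
  // Horizontal add across the four unsigned lanes, as Reduce<Integer,uint32x4_t> now does.
  uint32_t idata[4] = {1, 2, 3, 4};
  uint32x4_t iv = vld1q_u32(idata);
  printf("integer reduce = %u\n", vaddvq_u32(iv));             // prints 10

  // Lane rotation via vextq_f32; taking the shift mod 4 keeps it in range,
  // which is the restriction the patched tRotate<n> enforces.
  float fdata[4] = {0.f, 1.f, 2.f, 3.f};
  float32x4_t fv  = vld1q_f32(fdata);
  float32x4_t rot = vextq_f32(fv, fv, 5 % 4);                   // rotate by one lane
  float out[4]; vst1q_f32(out, rot);
  printf("rotated = %g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}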

View File

@ -376,7 +376,18 @@ class Grid_simd {
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
}
}
friend inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
}
friend inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){
Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
}
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
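The new exchange0..exchange3 friends simply forward to the Optimization::Exchange kernels at different granularities. As a rough scalar model of what the coarsest case does (an illustration only; the exact lane ordering of Grid's SIMD types is not spelled out here):

#include <array>
#include <cstddef>

// out1 collects the lower halves of both inputs, out2 the upper halves.
template <typename T, std::size_t N>
void exchange0_model(std::array<T,N> &out1, std::array<T,N> &out2,
                     const std::array<T,N> &in1, const std::array<T,N> &in2) {
  const std::size_t h = N / 2;
  for (std::size_t i = 0; i < h; i++) {
    out1[i]     = in1[i];       out1[i + h] = in2[i];
    out2[i]     = in1[i + h];   out2[i + h] = in2[i + h];
  }
}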

View File

@ -400,11 +400,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
splicetime-=usecond();
same_node = same_node && GatherSimd(source,dimension,shift,0x3,compress,face_idx);
auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx);
same_node = same_node && tmp;
splicetime+=usecond();
} else {
nosplicetime-=usecond();
same_node = same_node && Gather(source,dimension,shift,0x3,compress,face_idx);
auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx);
same_node = same_node && tmp;
nosplicetime+=usecond();
}
} else {
@ -412,13 +414,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
splicetime-=usecond();
// if checkerboard is unfavourable take two passes
// both with block stride loop iteration
same_node = same_node && GatherSimd(source,dimension,shift,0x1,compress,face_idx);
same_node = same_node && GatherSimd(source,dimension,shift,0x2,compress,face_idx);
auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx);
auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx);
same_node = same_node && tmp1 && tmp2;
splicetime+=usecond();
} else {
nosplicetime-=usecond();
same_node = same_node && Gather(source,dimension,shift,0x1,compress,face_idx);
same_node = same_node && Gather(source,dimension,shift,0x2,compress,face_idx);
auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx);
auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx);
same_node = same_node && tmp1 && tmp2;
nosplicetime+=usecond();
}
}
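The rewrites above pull each Gather/GatherSimd call out into a temporary before and-ing it into same_node. The reason is operator&& short-circuiting: once same_node is false the right-hand call would never run, and the gather's side effects (filling the halo buffers, advancing face_idx) would be silently skipped. A minimal illustration:

#include <cstdio>

bool Gather() { std::puts("Gather executed"); return false; }

int main() {
  bool same_node = false;
  same_node = same_node && Gather();   // short-circuits: Gather() is never called

  bool tmp = Gather();                 // patched pattern: always evaluate the call,
  same_node = same_node && tmp;        // then fold the result into the flag
  return 0;
}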

View File

@ -175,7 +175,7 @@ class TensorIndexRecursion {
}
}
template<class vtype,int N> inline static
void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0],0)),N> &arg, int i,int j)
void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0],0,0)),N> &arg, int i,int j)
{
for(int ii=0;ii<N;ii++){
TensorIndexRecursion<Level-1>::pokeIndex(ret._internal[ii],arg._internal[ii],i,j);
@ -191,7 +191,7 @@ class TensorIndexRecursion {
}}
}
template<class vtype,int N> inline static
void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0][0],0)),N> &arg, int i,int j)
void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0][0],0,0)),N> &arg, int i,int j)
{
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
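The change in both pokeIndex overloads above is just the extra ,0 inside the decltype: the two-index pokeIndex must declare its argument with the type returned by the matching two-index peekIndex, which can differ from the one-index return type at a given tensor level. A contrived standalone sketch of the same overload-selection idea (Mat, peek and poke here are hypothetical stand-ins, not Grid types):

#include <utility>
#include <vector>

struct Mat { double d[3][3]; };
inline std::vector<double> peek(const Mat &m, int i)        { return {m.d[i][0], m.d[i][1], m.d[i][2]}; }
inline double              peek(const Mat &m, int i, int j) { return m.d[i][j]; }

// decltype(peek(...,0,0)) selects the two-index overload, so val is a double;
// decltype(peek(...,0)) would wrongly make it a whole row.
inline void poke(Mat &m, const decltype(peek(std::declval<Mat&>(),0,0)) &val, int i, int j) {
  m.d[i][j] = val;
}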

View File

@ -243,6 +243,12 @@ void Grid_init(int *argc,char ***argv)
fname<<CartesianCommunicator::RankWorld();
fp=freopen(fname.str().c_str(),"w",stdout);
assert(fp!=(FILE *)NULL);
std::ostringstream ename;
ename<<"Grid.stderr.";
ename<<CartesianCommunicator::RankWorld();
fp=freopen(ename.str().c_str(),"w",stderr);
assert(fp!=(FILE *)NULL);
}
////////////////////////////////////
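The added lines mirror the existing stdout redirection so that each rank's stderr also lands in its own file. A minimal sketch of the pattern, with rank standing in for CartesianCommunicator::RankWorld():

#include <cassert>
#include <cstdio>
#include <sstream>

void redirect_output(int rank) {
  std::ostringstream fname, ename;
  fname << "Grid.stdout." << rank;
  ename << "Grid.stderr." << rank;
  FILE *fp;
  fp = freopen(fname.str().c_str(), "w", stdout); assert(fp != NULL);
  fp = freopen(ename.str().c_str(), "w", stderr); assert(fp != NULL);
}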

View File

@ -7,7 +7,7 @@ namespace Grid{
class Lexicographic {
public:
static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
static inline void CoorFromIndex (std::vector<int>& coor,int index,const std::vector<int> &dims){
int nd= dims.size();
coor.resize(nd);
for(int d=0;d<nd;d++){
@ -16,8 +16,12 @@ namespace Grid{
}
}
static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
static inline void IndexFromCoor (const std::vector<int>& coor,int &index,const std::vector<int> &dims){
int nd=dims.size();
if(nd > coor.size()) {
std::cout<< "coor.size "<<coor.size()<<" >dims.size "<<dims.size()<<std::endl;
assert(0);
}
int stride=1;
index=0;
for(int d=0;d<nd;d++){
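For reference, a self-contained version of the strided index computation together with the new size guard (the function name is illustrative, not Grid's):

#include <cassert>
#include <cstddef>
#include <vector>

int index_from_coor(const std::vector<int> &coor, const std::vector<int> &dims) {
  assert(coor.size() >= dims.size());       // the guard added above, as an assert
  int index = 0, stride = 1;
  for (std::size_t d = 0; d < dims.size(); d++) {
    index += stride * coor[d];              // lexicographic: fastest index first
    stride *= dims[d];
  }
  return index;                             // e.g. coor={1,2,0}, dims={4,4,4} -> 9
}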

View File

@ -1,6 +1,6 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_serialisation.cc
@ -29,12 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
using namespace Grid::QCD;
GRID_SERIALIZABLE_ENUM(myenum, undef, red, 1, blue, 2, green, 3);
class myclass: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(myclass,
@ -79,14 +78,14 @@ void ioTest(const std::string &filename, const O &object, const std::string &nam
// writer needs to be destroyed so that writing physically happens
{
W writer(filename);
write(writer, "testobject", object);
}
R reader(filename);
O buf;
bool good;
read(reader, "testobject", buf);
good = (object == buf);
std::cout << name << " IO test: " << (good ? "success" : "failure");
@ -98,7 +97,7 @@ int main(int argc,char **argv)
{
std::cout << "==== basic IO" << std::endl;
XmlWriter WR("bother.xml");
// test basic type writing
std::cout << "-- basic writing to 'bother.xml'..." << std::endl;
push(WR,"BasicTypes");
@ -112,12 +111,12 @@ int main(int argc,char **argv)
write(WR,"d",d);
write(WR,"b",b);
pop(WR);
// test serializable class writing
myclass obj(1234); // non-trivial constructor
std::vector<myclass> vec;
std::pair<myenum, myenum> pair;
std::cout << "-- serialisable class writing to 'bother.xml'..." << std::endl;
write(WR,"obj",obj);
WR.write("obj2", obj);
@ -132,11 +131,11 @@ int main(int argc,char **argv)
std::cout << "-- serialisable class comparison:" << std::endl;
std::cout << "vec[0] == obj: " << ((vec[0] == obj) ? "true" : "false") << std::endl;
std::cout << "vec[1] == obj: " << ((vec[1] == obj) ? "true" : "false") << std::endl;
write(WR, "objpair", pair);
std::cout << "-- pair writing to std::cout:" << std::endl;
std::cout << pair << std::endl;
// read tests
std::cout << "\n==== IO self-consistency tests" << std::endl;
//// XML
@ -151,6 +150,11 @@ int main(int argc,char **argv)
ioTest<TextWriter, TextReader>("iotest.dat", obj, "text (object) ");
ioTest<TextWriter, TextReader>("iotest.dat", vec, "text (vector of objects)");
ioTest<TextWriter, TextReader>("iotest.dat", pair, "text (pair of objects)");
//// JSON
ioTest<JSONWriter, JSONReader>("iotest.json", obj, "JSON (object) ");
ioTest<JSONWriter, JSONReader>("iotest.json", vec, "JSON (vector of objects)");
ioTest<JSONWriter, JSONReader>("iotest.json", pair, "JSON (pair of objects)");
//// HDF5
#undef HAVE_HDF5
#ifdef HAVE_HDF5
@ -158,13 +162,13 @@ int main(int argc,char **argv)
ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", vec, "HDF5 (vector of objects)");
ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", pair, "HDF5 (pair of objects)");
#endif
std::cout << "\n==== vector flattening/reconstruction" << std::endl;
typedef std::vector<std::vector<std::vector<double>>> vec3d;
vec3d dv, buf;
double d = 0.;
dv.resize(4);
for (auto &v1: dv)
{
@ -180,14 +184,14 @@ int main(int argc,char **argv)
}
std::cout << "original 3D vector:" << std::endl;
std::cout << dv << std::endl;
Flatten<vec3d> flatdv(dv);
std::cout << "\ndimensions:" << std::endl;
std::cout << flatdv.getDim() << std::endl;
std::cout << "\nflattened vector:" << std::endl;
std::cout << flatdv.getFlatVector() << std::endl;
Reconstruct<vec3d> rec(flatdv.getFlatVector(), flatdv.getDim());
std::cout << "\nreconstructed vector:" << std::endl;
std::cout << flatdv.getVector() << std::endl;
@ -199,10 +203,12 @@ int main(int argc,char **argv)
{
JSONWriter JW("bother.json");
// test basic type writing
myenum a = myenum::red;
push(JW,"BasicTypes");
write(JW,std::string("i16"),i16);
write(JW,"myenum",a);
write(JW,"u16",u16);
write(JW,"i32",i32);
write(JW,"u32",u32);
@ -212,23 +218,25 @@ int main(int argc,char **argv)
write(JW,"d",d);
write(JW,"b",b);
pop(JW);
// test serializable class writing
myclass obj(1234); // non-trivial constructor
std::cout << obj << std::endl;
std::cout << "-- serialisable class writing to 'bother.json'..." << std::endl;
write(JW,"obj",obj);
JW.write("obj2", obj);
std::cout << obj << std::endl;
std::vector<myclass> vec;
vec.push_back(myclass(1234));
vec.push_back(myclass(5678));
vec.push_back(myclass(3838));
write(JW, "objvec", vec);
}
{
JSONReader RD("bother.json");
myclass jcopy1;
@ -238,8 +246,9 @@ int main(int argc,char **argv)
std::cout << "Loaded (JSON) -----------------" << std::endl;
std::cout << jcopy1 << std::endl << jveccopy1 << std::endl;
}
/*
/*
// This is still work in progress
{
// Testing the next element function
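The ioTest helper near the top of this file scopes its writer in a block precisely because, as the comment says, writing only physically happens when the writer is destroyed. A toy RAII illustration of that round-trip pattern (ScopedWriter is hypothetical, not one of Grid's writer classes):

#include <fstream>
#include <string>

struct ScopedWriter {
  std::ofstream out;
  explicit ScopedWriter(const std::string &f) : out(f) {}
  void write(const std::string &tag, int v) { out << tag << " " << v << "\n"; }
  ~ScopedWriter() { out.flush(); }          // the physical write happens here
};

void round_trip(const std::string &file) {
  { ScopedWriter w(file); w.write("testobject", 42); }   // destroyed -> flushed
  std::ifstream in(file);
  std::string tag; int v; in >> tag >> v;                // read back and compare to 42
}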

View File

@ -1,4 +1,4 @@
SUBDIRS = . core forces hmc solver debug smearing IO
SUBDIRS = . core forces hmc solver debug smearing IO lanczos
if BUILD_CHROMA_REGRESSION
SUBDIRS+= qdpxx

View File

@ -80,31 +80,47 @@ int main (int argc, char ** argv)
LatticeFermionD src_o(FrbGrid);
LatticeFermionD result_o(FrbGrid);
LatticeFermionD result_o_2(FrbGrid);
LatticeFermionD result_cg(FrbGrid);
pickCheckerboard(Odd,src_o,src);
result_o.checkerboard = Odd;
result_o = zero;
result_o_2.checkerboard = Odd;
result_o_2 = zero;
result_cg.checkerboard = Odd;
result_cg = zero;
LatticeFermionD result_mcg(result_cg);
LatticeFermionD result_rlcg(result_cg);
SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
SchurDiagMooeeOperator<DomainWallFermionFH,LatticeFermionF> HermOpEO_f(Ddwf_f);
//#define DO_MIXED_CG
#define DO_RLUP_CG
#ifdef DO_MIXED_CG
std::cout << "Starting mixed CG" << std::endl;
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
mCG.InnerTolerance = 3.0e-5;
mCG(src_o,result_o);
mCG(src_o,result_mcg);
#endif
#ifdef DO_RLUP_CG
std::cout << "Starting reliable update CG" << std::endl;
ConjugateGradientReliableUpdate<LatticeFermionD,LatticeFermionF> rlCG(1.e-8, 10000, 0.1, FrbGrid_f, HermOpEO_f, HermOpEO);
rlCG(src_o,result_rlcg);
#endif
std::cout << "Starting regular CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
CG(HermOpEO,src_o,result_o_2);
CG(HermOpEO,src_o,result_cg);
LatticeFermionD diff_o(FrbGrid);
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
std::cout << "Diff between mixed and regular CG: " << diff << std::endl;
#ifdef DO_MIXED_CG
LatticeFermionD diff_mcg(FrbGrid);
RealD vdiff_mcg = axpy_norm(diff_mcg, -1.0, result_cg, result_mcg);
std::cout << "Diff between mixed and regular CG: " << vdiff_mcg << std::endl;
#endif
#ifdef DO_RLUP_CG
LatticeFermionD diff_rlcg(FrbGrid);
RealD vdiff_rlcg = axpy_norm(diff_rlcg, -1.0, result_cg, result_rlcg);
std::cout << "Diff between reliable update and regular CG: " << vdiff_rlcg << std::endl;
#endif
Grid_finalize();
}

View File

@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian rbFine(&Fine);
GridParallelRNG fRNG(&Fine);
// fRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});

View File

@ -47,7 +47,7 @@ int main (int argc, char ** argv)
mask[0]=0;
GridCartesian Fine (latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);
GridRedBlackCartesian RBFine(&Fine,mask,1);
GridParallelRNG FineRNG(&Fine); FineRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

View File

@ -47,7 +47,7 @@ int main (int argc, char ** argv)
mask[0]=0;
GridCartesian Fine (latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);
GridRedBlackCartesian RBFine(&Fine,mask,1);
GridParallelRNG FineRNG(&Fine); FineRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

View File

@ -0,0 +1,239 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/core/Test_dwf_eofa_even_odd.cc
Copyright (C) 2017
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
const int Ls = 8;
// GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridCartesian* FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src (FGrid); random(RNG5, src);
LatticeFermion phi (FGrid); random(RNG5, phi);
LatticeFermion chi (FGrid); random(RNG5, chi);
LatticeFermion result(FGrid); result = zero;
LatticeFermion ref (FGrid); ref = zero;
LatticeFermion tmp (FGrid); tmp = zero;
LatticeFermion err (FGrid); err = zero;
LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
// Only one non-zero (y)
Umu = zero;
for(int nn=0; nn<Nd; nn++){
random(RNG4, U[nn]);
if(nn>0){ U[nn] = zero; }
PokeIndex<LorentzIndex>(Umu, U[nn], nn);
}
RealD mq1 = 0.1;
RealD mq2 = 0.5;
RealD mq3 = 1.0;
RealD shift = 0.1234;
RealD M5 = 1.8;
int pm = 1;
DomainWallEOFAFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mq1, mq2, mq3, shift, pm, M5);
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
LatticeFermion r_eeoo(FGrid);
std::cout << GridLogMessage << "==========================================================" << std::endl;
std::cout << GridLogMessage << "= Testing that Meo + Moe + Moo + Mee = Munprec " << std::endl;
std::cout << GridLogMessage << "==========================================================" << std::endl;
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd, src_o, src);
Ddwf.Meooe(src_e, r_o); std::cout << GridLogMessage << "Applied Meo" << std::endl;
Ddwf.Meooe(src_o, r_e); std::cout << GridLogMessage << "Applied Moe" << std::endl;
setCheckerboard(r_eo, r_o);
setCheckerboard(r_eo, r_e);
Ddwf.Mooee(src_e, r_e); std::cout << GridLogMessage << "Applied Mee" << std::endl;
Ddwf.Mooee(src_o, r_o); std::cout << GridLogMessage << "Applied Moo" << std::endl;
setCheckerboard(r_eeoo, r_e);
setCheckerboard(r_eeoo, r_o);
r_eo = r_eo + r_eeoo;
Ddwf.M(src, ref);
// std::cout << GridLogMessage << r_eo << std::endl;
// std::cout << GridLogMessage << ref << std::endl;
err = ref - r_eo;
std::cout << GridLogMessage << "EO norm diff " << norm2(err) << " " << norm2(ref) << " " << norm2(r_eo) << std::endl;
LatticeComplex cerr(FGrid);
cerr = localInnerProduct(err,err);
// std::cout << GridLogMessage << cerr << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
std::cout << GridLogMessage << "= Test Ddagger is the dagger of D by requiring " << std::endl;
std::cout << GridLogMessage << "= < phi | Deo | chi > * = < chi | Deo^dag| phi> " << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
LatticeFermion chi_e (FrbGrid);
LatticeFermion chi_o (FrbGrid);
LatticeFermion dchi_e(FrbGrid);
LatticeFermion dchi_o(FrbGrid);
LatticeFermion phi_e (FrbGrid);
LatticeFermion phi_o (FrbGrid);
LatticeFermion dphi_e(FrbGrid);
LatticeFermion dphi_o(FrbGrid);
pickCheckerboard(Even, chi_e, chi);
pickCheckerboard(Odd , chi_o, chi);
pickCheckerboard(Even, phi_e, phi);
pickCheckerboard(Odd , phi_o, phi);
Ddwf.Meooe (chi_e, dchi_o);
Ddwf.Meooe (chi_o, dchi_e);
Ddwf.MeooeDag(phi_e, dphi_o);
Ddwf.MeooeDag(phi_o, dphi_e);
ComplexD pDce = innerProduct(phi_e, dchi_e);
ComplexD pDco = innerProduct(phi_o, dchi_o);
ComplexD cDpe = innerProduct(chi_e, dphi_e);
ComplexD cDpo = innerProduct(chi_o, dphi_o);
std::cout << GridLogMessage << "e " << pDce << " " << cDpe << std::endl;
std::cout << GridLogMessage << "o " << pDco << " " << cDpo << std::endl;
std::cout << GridLogMessage << "pDce - conj(cDpo) " << pDce-conj(cDpo) << std::endl;
std::cout << GridLogMessage << "pDco - conj(cDpe) " << pDco-conj(cDpe) << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
std::cout << GridLogMessage << "= Test MeeInv Mee = 1 " << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
pickCheckerboard(Even, chi_e, chi);
pickCheckerboard(Odd , chi_o, chi);
Ddwf.Mooee (chi_e, src_e);
Ddwf.MooeeInv(src_e, phi_e);
Ddwf.Mooee (chi_o, src_o);
Ddwf.MooeeInv(src_o, phi_o);
setCheckerboard(phi, phi_e);
setCheckerboard(phi, phi_o);
err = phi - chi;
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
std::cout << GridLogMessage << "= Test MeeInvDag MeeDag = 1 " << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
pickCheckerboard(Even, chi_e, chi);
pickCheckerboard(Odd , chi_o, chi);
Ddwf.MooeeDag (chi_e, src_e);
Ddwf.MooeeInvDag(src_e, phi_e);
Ddwf.MooeeDag (chi_o, src_o);
Ddwf.MooeeInvDag(src_o, phi_o);
setCheckerboard(phi, phi_e);
setCheckerboard(phi, phi_o);
err = phi - chi;
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
std::cout << GridLogMessage << "= Test MpcDagMpc is Hermitian " << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
random(RNG5, phi);
random(RNG5, chi);
pickCheckerboard(Even, chi_e, chi);
pickCheckerboard(Odd , chi_o, chi);
pickCheckerboard(Even, phi_e, phi);
pickCheckerboard(Odd , phi_o, phi);
RealD t1,t2;
SchurDiagMooeeOperator<DomainWallEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e, dchi_e, t1, t2);
HermOpEO.MpcDagMpc(chi_o, dchi_o, t1, t2);
HermOpEO.MpcDagMpc(phi_e, dphi_e, t1, t2);
HermOpEO.MpcDagMpc(phi_o, dphi_o, t1, t2);
pDce = innerProduct(phi_e, dchi_e);
pDco = innerProduct(phi_o, dchi_o);
cDpe = innerProduct(chi_e, dphi_e);
cDpo = innerProduct(chi_o, dphi_o);
std::cout << GridLogMessage << "e " << pDce << " " << cDpe << std::endl;
std::cout << GridLogMessage << "o " << pDco << " " << cDpo << std::endl;
std::cout << GridLogMessage << "pDce - conj(cDpo) " << pDco-conj(cDpo) << std::endl;
std::cout << GridLogMessage << "pDco - conj(cDpe) " << pDce-conj(cDpe) << std::endl;
Grid_finalize();
}
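The first block of this new test checks the red-black decomposition of the EOFA operator. Writing the operator in checkerboard block form, the identity being verified is

\[
M \begin{pmatrix}\psi_e\\ \psi_o\end{pmatrix}
 = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix}
   \begin{pmatrix}\psi_e\\ \psi_o\end{pmatrix}
 = \begin{pmatrix} M_{ee}\psi_e + M_{eo}\psi_o \\ M_{oe}\psi_e + M_{oo}\psi_o \end{pmatrix},
\]

so assembling the Meooe and Mooee results onto their checkerboards and adding them must reproduce the unpreconditioned M(src). The dagger block then checks \(\langle\phi|D_{eo}|\chi\rangle^{*}=\langle\chi|D_{eo}^{\dagger}|\phi\rangle\), and the final block that the Schur operator MpcDagMpc is Hermitian.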

View File

@ -47,7 +47,7 @@ int main (int argc, char ** argv)
vol = vol * latt_size[d];
}
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGRID(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGRID(&GRID);
LatticeComplexD one(&GRID);
LatticeComplexD zz(&GRID);

View File

@ -33,22 +33,68 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;
typedef typename GparityDomainWallFermionR::FermionField FermionField;
//typedef GparityDomainWallFermionD GparityDiracOp;
//typedef DomainWallFermionD StandardDiracOp;
//#define DOP_PARAMS
typedef GparityMobiusFermionD GparityDiracOp;
typedef MobiusFermionD StandardDiracOp;
#define DOP_PARAMS ,1.5, 0.5
typedef typename GparityDiracOp::FermionField GparityFermionField;
typedef typename GparityDiracOp::GaugeField GparityGaugeField;
typedef typename GparityFermionField::vector_type vComplexType;
typedef typename StandardDiracOp::FermionField StandardFermionField;
typedef typename StandardDiracOp::GaugeField StandardGaugeField;
enum{ same_vComplex = std::is_same<vComplexType, typename StandardFermionField::vector_type>::value };
static_assert(same_vComplex == 1, "Dirac Operators must have same underlying SIMD complex type");
int main (int argc, char ** argv)
{
const int nu = 3;
int nu = 0;
Grid_init(&argc,&argv);
for(int i=1;i<argc;i++){
if(std::string(argv[i]) == "--Gparity-dir"){
std::stringstream ss; ss << argv[i+1]; ss >> nu;
std::cout << GridLogMessage << "Set Gparity direction to " << nu << std::endl;
}
}
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Testing Gparity Dirac operator "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexType::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
const int Ls=4;
const int L =4;
std::vector<int> latt_2f(Nd,L);
std::vector<int> latt_1f(Nd,L); latt_1f[nu] = 2*L;
//const int L =4;
//std::vector<int> latt_2f(Nd,L);
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> latt_2f = GridDefaultLatt();
std::vector<int> latt_1f(latt_2f); latt_1f[nu] = 2*latt_2f[nu];
int L = latt_2f[nu];
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexType::Nsimd());
std::cout << GridLogMessage << "SIMD layout: ";
for(int i=0;i<simd_layout.size();i++) std::cout << simd_layout[i] << " ";
std::cout << std::endl;
std::vector<int> mpi_layout = GridDefaultMpi(); //node layout
GridCartesian * UGrid_1f = SpaceTimeGrid::makeFourDimGrid(latt_1f, simd_layout, mpi_layout);
@ -67,13 +113,13 @@ int main (int argc, char ** argv)
GridParallelRNG RNG5_2f(FGrid_2f); RNG5_2f.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4_2f(UGrid_2f); RNG4_2f.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu_2f(UGrid_2f);
GparityGaugeField Umu_2f(UGrid_2f);
SU3::HotConfiguration(RNG4_2f,Umu_2f);
LatticeFermion src (FGrid_2f);
LatticeFermion tmpsrc(FGrid_2f);
FermionField src_2f(FGrid_2f);
LatticeFermion src_1f(FGrid_1f);
StandardFermionField src (FGrid_2f);
StandardFermionField tmpsrc(FGrid_2f);
GparityFermionField src_2f(FGrid_2f);
StandardFermionField src_1f(FGrid_1f);
// Replicate fermion source
random(RNG5_2f,src);
@ -81,8 +127,8 @@ int main (int argc, char ** argv)
tmpsrc=src*2.0;
PokeIndex<0>(src_2f,tmpsrc,1);
LatticeFermion result_1f(FGrid_1f); result_1f=zero;
LatticeGaugeField Umu_1f(UGrid_1f);
StandardFermionField result_1f(FGrid_1f); result_1f=zero;
StandardGaugeField Umu_1f(UGrid_1f);
Replicate(Umu_2f,Umu_1f);
//Coordinate grid for reference
@ -92,7 +138,7 @@ int main (int argc, char ** argv)
//Copy-conjugate the gauge field
//First C-shift the lattice by Lx/2
{
LatticeGaugeField Umu_shift = conjugate( Cshift(Umu_1f,nu,L) );
StandardGaugeField Umu_shift = conjugate( Cshift(Umu_1f,nu,L) );
Umu_1f = where( xcoor_1f >= Integer(L), Umu_shift, Umu_1f );
// hack test to check the same
@ -101,7 +147,7 @@ int main (int argc, char ** argv)
cout << GridLogMessage << "Umu diff " << norm2(Umu_shift)<<std::endl;
//Make the gauge field antiperiodic in nu-direction
LatticeColourMatrix Unu(UGrid_1f);
decltype(PeekIndex<LorentzIndex>(Umu_1f,nu)) Unu(UGrid_1f);
Unu = PeekIndex<LorentzIndex>(Umu_1f,nu);
Unu = where(xcoor_1f == Integer(2*L-1), -Unu, Unu);
PokeIndex<LorentzIndex>(Umu_1f,Unu,nu);
@ -115,33 +161,33 @@ int main (int argc, char ** argv)
RealD mass=0.0;
RealD M5=1.8;
DomainWallFermionR Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5);
StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS);
LatticeFermion src_o_1f(FrbGrid_1f);
LatticeFermion result_o_1f(FrbGrid_1f);
StandardFermionField src_o_1f(FrbGrid_1f);
StandardFermionField result_o_1f(FrbGrid_1f);
pickCheckerboard(Odd,src_o_1f,src_1f);
result_o_1f=zero;
SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
SchurDiagMooeeOperator<StandardDiracOp,StandardFermionField> HermOpEO(Ddwf);
ConjugateGradient<StandardFermionField> CG(1.0e-8,10000);
CG(HermOpEO,src_o_1f,result_o_1f);
// const int nu = 3;
std::vector<int> twists(Nd,0);
twists[nu] = 1;
GparityDomainWallFermionR::ImplParams params;
GparityDiracOp::ImplParams params;
params.twists = twists;
GparityDomainWallFermionR GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5,params);
GparityDiracOp GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5 DOP_PARAMS,params);
for(int disp=-1;disp<=1;disp+=2)
for(int mu=0;mu<5;mu++)
{
FermionField Dsrc_2f(FGrid_2f);
GparityFermionField Dsrc_2f(FGrid_2f);
LatticeFermion Dsrc_1f(FGrid_1f);
LatticeFermion Dsrc_2freplica(FGrid_1f);
LatticeFermion Dsrc_2freplica0(FGrid_1f);
LatticeFermion Dsrc_2freplica1(FGrid_1f);
StandardFermionField Dsrc_1f(FGrid_1f);
StandardFermionField Dsrc_2freplica(FGrid_1f);
StandardFermionField Dsrc_2freplica0(FGrid_1f);
StandardFermionField Dsrc_2freplica1(FGrid_1f);
if ( mu ==0 ) {
std::cout << GridLogMessage<< " Cross checking entire hopping term"<<std::endl;
@ -156,8 +202,8 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "S norms "<< norm2(src_2f) << " " << norm2(src_1f) <<std::endl;
std::cout << GridLogMessage << "D norms "<< norm2(Dsrc_2f)<< " " << norm2(Dsrc_1f) <<std::endl;
LatticeFermion Dsrc_2f0(FGrid_2f); Dsrc_2f0 = PeekIndex<0>(Dsrc_2f,0);
LatticeFermion Dsrc_2f1(FGrid_2f); Dsrc_2f1 = PeekIndex<0>(Dsrc_2f,1);
StandardFermionField Dsrc_2f0(FGrid_2f); Dsrc_2f0 = PeekIndex<0>(Dsrc_2f,0);
StandardFermionField Dsrc_2f1(FGrid_2f); Dsrc_2f1 = PeekIndex<0>(Dsrc_2f,1);
// Dsrc_2f1 = Dsrc_2f1 - Dsrc_2f0;
// std::cout << GridLogMessage << " Cross check two halves " <<norm2(Dsrc_2f1)<<std::endl;
@ -174,20 +220,20 @@ int main (int argc, char ** argv)
}
{
FermionField chi (FGrid_2f); gaussian(RNG5_2f,chi);
FermionField phi (FGrid_2f); gaussian(RNG5_2f,phi);
GparityFermionField chi (FGrid_2f); gaussian(RNG5_2f,chi);
GparityFermionField phi (FGrid_2f); gaussian(RNG5_2f,phi);
FermionField chi_e (FrbGrid_2f);
FermionField chi_o (FrbGrid_2f);
GparityFermionField chi_e (FrbGrid_2f);
GparityFermionField chi_o (FrbGrid_2f);
FermionField dchi_e (FrbGrid_2f);
FermionField dchi_o (FrbGrid_2f);
GparityFermionField dchi_e (FrbGrid_2f);
GparityFermionField dchi_o (FrbGrid_2f);
FermionField phi_e (FrbGrid_2f);
FermionField phi_o (FrbGrid_2f);
GparityFermionField phi_e (FrbGrid_2f);
GparityFermionField phi_o (FrbGrid_2f);
FermionField dphi_e (FrbGrid_2f);
FermionField dphi_o (FrbGrid_2f);
GparityFermionField dphi_e (FrbGrid_2f);
GparityFermionField dphi_o (FrbGrid_2f);
pickCheckerboard(Even,chi_e,chi);
pickCheckerboard(Odd ,chi_o,chi);
@ -212,14 +258,14 @@ int main (int argc, char ** argv)
}
FermionField result_2f(FGrid_2f); result_2f=zero;
FermionField src_o_2f(FrbGrid_2f);
FermionField result_o_2f(FrbGrid_2f);
GparityFermionField result_2f(FGrid_2f); result_2f=zero;
GparityFermionField src_o_2f(FrbGrid_2f);
GparityFermionField result_o_2f(FrbGrid_2f);
pickCheckerboard(Odd,src_o_2f,src_2f);
result_o_2f=zero;
ConjugateGradient<FermionField> CG2f(1.0e-8,10000);
SchurDiagMooeeOperator<GparityDomainWallFermionR,FermionField> HermOpEO2f(GPDdwf);
ConjugateGradient<GparityFermionField> CG2f(1.0e-8,10000);
SchurDiagMooeeOperator<GparityDiracOp,GparityFermionField> HermOpEO2f(GPDdwf);
CG2f(HermOpEO2f,src_o_2f,result_o_2f);
std::cout << "2f cb "<<result_o_2f.checkerboard<<std::endl;
@ -227,10 +273,10 @@ int main (int argc, char ** argv)
std::cout << " result norms " <<norm2(result_o_2f)<<" " <<norm2(result_o_1f)<<std::endl;
LatticeFermion res0o (FrbGrid_2f);
LatticeFermion res1o (FrbGrid_2f);
LatticeFermion res0 (FGrid_2f);
LatticeFermion res1 (FGrid_2f);
StandardFermionField res0o (FrbGrid_2f);
StandardFermionField res1o (FrbGrid_2f);
StandardFermionField res0 (FGrid_2f);
StandardFermionField res1 (FGrid_2f);
res0=zero;
res1=zero;
@ -244,9 +290,9 @@ int main (int argc, char ** argv)
setCheckerboard(res0,res0o);
setCheckerboard(res1,res1o);
LatticeFermion replica (FGrid_1f);
LatticeFermion replica0(FGrid_1f);
LatticeFermion replica1(FGrid_1f);
StandardFermionField replica (FGrid_1f);
StandardFermionField replica0(FGrid_1f);
StandardFermionField replica1(FGrid_1f);
Replicate(res0,replica0);
Replicate(res1,replica1);
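For orientation, the construction in this test maps the two-flavour G-parity operator onto a single flavour living on a lattice doubled in the G-parity direction \(\nu\): the source is replicated, and the gauge field on the second copy is the complex conjugate of the first,

\[
U'_\mu(x)=
\begin{cases}
U_\mu(x), & 0 \le x_\nu < L,\\
U_\mu^{*}(x-L\hat\nu), & L \le x_\nu < 2L,
\end{cases}
\]

with the \(\nu\)-link at \(x_\nu = 2L-1\) picking up a minus sign (antiperiodicity). Solving CG in both formulations and replicating the two flavour components onto the doubled lattice should then give component-by-component agreement, which is what the final comparisons in this test measure.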

View File

@ -40,7 +40,7 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

View File

@ -84,7 +84,7 @@ int main(int argc, char **argv) {
double volume = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Fine(latt_size, simd_layout, mpi_layout);
GridRedBlackCartesian rbFine(latt_size, simd_layout, mpi_layout);
GridRedBlackCartesian rbFine(&Fine);
GridParallelRNG FineRNG(&Fine);
GridSerialRNG SerialRNG;
GridSerialRNG SerialRNG1;

View File

@ -0,0 +1,241 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/core/Test_dwf_eofa_even_odd.cc
Copyright (C) 2017
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
const int Ls = 8;
// GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridCartesian* FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src (FGrid); random(RNG5, src);
LatticeFermion phi (FGrid); random(RNG5, phi);
LatticeFermion chi (FGrid); random(RNG5, chi);
LatticeFermion result(FGrid); result = zero;
LatticeFermion ref (FGrid); ref = zero;
LatticeFermion tmp (FGrid); tmp = zero;
LatticeFermion err (FGrid); err = zero;
LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
// Only one non-zero (y)
Umu = zero;
for(int nn=0; nn<Nd; nn++){
random(RNG4, U[nn]);
if(nn>0){ U[nn] = zero; }
PokeIndex<LorentzIndex>(Umu, U[nn], nn);
}
RealD b = 2.5;
RealD c = 1.5;
RealD mq1 = 0.1;
RealD mq2 = 0.5;
RealD mq3 = 1.0;
RealD shift = 0.1234;
RealD M5 = 1.8;
int pm = 1;
MobiusEOFAFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mq1, mq2, mq3, shift, pm, M5, b, c);
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
LatticeFermion r_eeoo(FGrid);
std::cout << GridLogMessage << "==========================================================" << std::endl;
std::cout << GridLogMessage << "= Testing that Meo + Moe + Moo + Mee = Munprec " << std::endl;
std::cout << GridLogMessage << "==========================================================" << std::endl;
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd, src_o, src);
Ddwf.Meooe(src_e, r_o); std::cout << GridLogMessage << "Applied Meo" << std::endl;
Ddwf.Meooe(src_o, r_e); std::cout << GridLogMessage << "Applied Moe" << std::endl;
setCheckerboard(r_eo, r_o);
setCheckerboard(r_eo, r_e);
Ddwf.Mooee(src_e, r_e); std::cout << GridLogMessage << "Applied Mee" << std::endl;
Ddwf.Mooee(src_o, r_o); std::cout << GridLogMessage << "Applied Moo" << std::endl;
setCheckerboard(r_eeoo, r_e);
setCheckerboard(r_eeoo, r_o);
r_eo = r_eo + r_eeoo;
Ddwf.M(src, ref);
// std::cout << GridLogMessage << r_eo << std::endl;
// std::cout << GridLogMessage << ref << std::endl;
err = ref - r_eo;
std::cout << GridLogMessage << "EO norm diff " << norm2(err) << " " << norm2(ref) << " " << norm2(r_eo) << std::endl;
LatticeComplex cerr(FGrid);
cerr = localInnerProduct(err,err);
// std::cout << GridLogMessage << cerr << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
std::cout << GridLogMessage << "= Test Ddagger is the dagger of D by requiring " << std::endl;
std::cout << GridLogMessage << "= < phi | Deo | chi > * = < chi | Deo^dag| phi> " << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
LatticeFermion chi_e (FrbGrid);
LatticeFermion chi_o (FrbGrid);
LatticeFermion dchi_e(FrbGrid);
LatticeFermion dchi_o(FrbGrid);
LatticeFermion phi_e (FrbGrid);
LatticeFermion phi_o (FrbGrid);
LatticeFermion dphi_e(FrbGrid);
LatticeFermion dphi_o(FrbGrid);
pickCheckerboard(Even, chi_e, chi);
pickCheckerboard(Odd , chi_o, chi);
pickCheckerboard(Even, phi_e, phi);
pickCheckerboard(Odd , phi_o, phi);
Ddwf.Meooe (chi_e, dchi_o);
Ddwf.Meooe (chi_o, dchi_e);
Ddwf.MeooeDag(phi_e, dphi_o);
Ddwf.MeooeDag(phi_o, dphi_e);
ComplexD pDce = innerProduct(phi_e, dchi_e);
ComplexD pDco = innerProduct(phi_o, dchi_o);
ComplexD cDpe = innerProduct(chi_e, dphi_e);
ComplexD cDpo = innerProduct(chi_o, dphi_o);
std::cout << GridLogMessage << "e " << pDce << " " << cDpe << std::endl;
std::cout << GridLogMessage << "o " << pDco << " " << cDpo << std::endl;
std::cout << GridLogMessage << "pDce - conj(cDpo) " << pDce-conj(cDpo) << std::endl;
std::cout << GridLogMessage << "pDco - conj(cDpe) " << pDco-conj(cDpe) << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
std::cout << GridLogMessage << "= Test MeeInv Mee = 1 " << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
pickCheckerboard(Even, chi_e, chi);
pickCheckerboard(Odd , chi_o, chi);
Ddwf.Mooee (chi_e, src_e);
Ddwf.MooeeInv(src_e, phi_e);
Ddwf.Mooee (chi_o, src_o);
Ddwf.MooeeInv(src_o, phi_o);
setCheckerboard(phi, phi_e);
setCheckerboard(phi, phi_o);
err = phi - chi;
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
std::cout << GridLogMessage << "= Test MeeInvDag MeeDag = 1 " << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
pickCheckerboard(Even, chi_e, chi);
pickCheckerboard(Odd , chi_o, chi);
Ddwf.MooeeDag (chi_e, src_e);
Ddwf.MooeeInvDag(src_e, phi_e);
Ddwf.MooeeDag (chi_o, src_o);
Ddwf.MooeeInvDag(src_o, phi_o);
setCheckerboard(phi, phi_e);
setCheckerboard(phi, phi_o);
err = phi - chi;
std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
std::cout << GridLogMessage << "= Test MpcDagMpc is Hermitian " << std::endl;
std::cout << GridLogMessage << "==============================================================" << std::endl;
random(RNG5, phi);
random(RNG5, chi);
pickCheckerboard(Even, chi_e, chi);
pickCheckerboard(Odd , chi_o, chi);
pickCheckerboard(Even, phi_e, phi);
pickCheckerboard(Odd , phi_o, phi);
RealD t1,t2;
SchurDiagMooeeOperator<MobiusEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e, dchi_e, t1, t2);
HermOpEO.MpcDagMpc(chi_o, dchi_o, t1, t2);
HermOpEO.MpcDagMpc(phi_e, dphi_e, t1, t2);
HermOpEO.MpcDagMpc(phi_o, dphi_o, t1, t2);
pDce = innerProduct(phi_e, dchi_e);
pDco = innerProduct(phi_o, dchi_o);
cDpe = innerProduct(chi_e, dphi_e);
cDpo = innerProduct(chi_o, dphi_o);
std::cout << GridLogMessage << "e " << pDce << " " << cDpe << std::endl;
std::cout << GridLogMessage << "o " << pDco << " " << cDpo << std::endl;
std::cout << GridLogMessage << "pDce - conj(cDpo) " << pDco-conj(cDpo) << std::endl;
std::cout << GridLogMessage << "pDco - conj(cDpe) " << pDce-conj(cDpe) << std::endl;
Grid_finalize();
}

View File

@ -40,7 +40,7 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

View File

@ -51,7 +51,7 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

View File

@ -52,7 +52,7 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

View File

@ -0,0 +1,102 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/debug/Test_heatbath_dwf_eofa.cc
Copyright (C) 2017
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//////////////////////////////////////////////////////////////////////////////////////////
// This program sets up the initial pseudofermion field |Phi> = Meofa^{-1/2}*|eta>, and
// then uses this Phi to compute the action <Phi|Meofa|Phi>.
// If all is working, one should find that <eta|eta> = <Phi|Meofa|Phi>.
//////////////////////////////////////////////////////////////////////////////////////////
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Parameters for test
const std::vector<int> grid_dim = { 8, 8, 8, 8 };
const int Ls = 8;
const int Npoles = 12;
const RealD mf = 0.01;
const RealD mpv = 1.0;
const RealD M5 = 1.8;
int main(int argc, char** argv)
{
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is set up to use " << threads << " threads" << std::endl;
// Initialize spacetime grid
std::cout << GridLogMessage << "Lattice dimensions: " << grid_dim << " Ls: " << Ls << std::endl;
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(grid_dim,
GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
// Set up RNGs
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
DomainWallEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5);
DomainWallEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5);
// Construct the action and test the heatbath (zero initial guess)
{
OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, Npoles);
ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);
Meofa.refresh(Umu, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
// Construct the action and test the heatbath (forecasted initial guesses)
{
OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, Npoles);
ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);
Meofa.refresh(Umu, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
return 0;
}
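The equality these heatbath tests look for follows in one line from the definition quoted in the header comment: with \(\Phi = M_{\rm EOFA}^{-1/2}\,\eta\),

\[
\langle\Phi| M_{\rm EOFA} |\Phi\rangle
 = \langle\eta|\, M_{\rm EOFA}^{-1/2}\, M_{\rm EOFA}\, M_{\rm EOFA}^{-1/2}\,|\eta\rangle
 = \langle\eta|\eta\rangle ,
\]

so the number printed by Meofa.S(Umu) should match the Gaussian norm \(\langle\eta|\eta\rangle\) up to the CG tolerance, with or without the forecasted initial guesses. The same check is repeated below for the G-parity and Mobius variants of the action.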

View File

@ -0,0 +1,108 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/debug/Test_heatbath_dwf_eofa.cc
Copyright (C) 2017
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//////////////////////////////////////////////////////////////////////////////////////////
// This program sets up the initial pseudofermion field |Phi> = Meofa^{-1/2}*|eta>, and
// then uses this Phi to compute the action <Phi|Meofa|Phi>.
// If all is working, one should find that <eta|eta> = <Phi|Meofa|Phi>.
//////////////////////////////////////////////////////////////////////////////////////////
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
typedef GparityWilsonImplR FermionImplPolicy;
typedef GparityDomainWallEOFAFermionR FermionAction;
typedef typename FermionAction::FermionField FermionField;
// Parameters for test
const std::vector<int> grid_dim = { 8, 8, 8, 8 };
const int Ls = 8;
const int Npoles = 12;
const RealD mf = 0.01;
const RealD mpv = 1.0;
const RealD M5 = 1.8;
int main(int argc, char** argv)
{
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is set up to use " << threads << " threads" << std::endl;
// Initialize spacetime grid
std::cout << GridLogMessage << "Lattice dimensions: " << grid_dim << " Ls: " << Ls << std::endl;
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(grid_dim,
GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
// Set up RNGs
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
// GparityDomainWallFermionR::ImplParams params;
FermionAction::ImplParams params;
FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, params);
FermionAction Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, params);
// Construct the action and test the heatbath (zero initial guess)
{
OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, Npoles);
ConjugateGradient<FermionField> CG(1.0e-12, 5000);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);
Meofa.refresh(Umu, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
// Construct the action and test the heatbath (forecasted initial guesses)
{
OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, Npoles);
ConjugateGradient<FermionField> CG(1.0e-12, 5000);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);
Meofa.refresh(Umu, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
return 0;
}

View File

@ -0,0 +1,104 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/debug/Test_heatbath_dwf_eofa.cc
Copyright (C) 2017
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//////////////////////////////////////////////////////////////////////////////////////////
// This program sets up the initial pseudofermion field |Phi> = Meofa^{-1/2}*|eta>, and
// then uses this Phi to compute the action <Phi|Meofa|Phi>.
// If all is working, one should find that <eta|eta> = <Phi|Meofa|Phi>.
//////////////////////////////////////////////////////////////////////////////////////////
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Parameters for test
const std::vector<int> grid_dim = { 8, 8, 8, 8 };
const int Ls = 8;
const int Npoles = 12;
const RealD b = 2.5;
const RealD c = 1.5;
const RealD mf = 0.01;
const RealD mpv = 1.0;
const RealD M5 = 1.8;
int main(int argc, char** argv)
{
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is set up to use " << threads << " threads" << std::endl;
// Initialize spacetime grid
std::cout << GridLogMessage << "Lattice dimensions: " << grid_dim << " Ls: " << Ls << std::endl;
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(grid_dim,
GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
// Set up RNGs
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
MobiusEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c);
MobiusEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, b, c);
// Construct the action and test the heatbath (zero initial guess)
{
OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, Npoles);
ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);
Meofa.refresh(Umu, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
// Construct the action and test the heatbath (forecasted initial guesses)
{
OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, Npoles);
ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);
Meofa.refresh(Umu, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
return 0;
}

View File

@ -0,0 +1,109 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/debug/Test_heatbath_dwf_eofa.cc
Copyright (C) 2017
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//////////////////////////////////////////////////////////////////////////////////////////
// This program sets up the initial pseudofermion field |Phi> = Meofa^{-1/2}*|eta>, and
// then uses this Phi to compute the action <Phi|Meofa|Phi>.
// If all is working, one should find that <eta|eta> = <Phi|Meofa|Phi>.
//////////////////////////////////////////////////////////////////////////////////////////
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
typedef GparityWilsonImplR FermionImplPolicy;
typedef GparityMobiusEOFAFermionR FermionAction;
typedef typename FermionAction::FermionField FermionField;
// Parameters for test
const std::vector<int> grid_dim = { 8, 8, 8, 8 };
const int Ls = 8;
const int Npoles = 12;
const RealD b = 2.5;
const RealD c = 1.5;
const RealD mf = 0.01;
const RealD mpv = 1.0;
const RealD M5 = 1.8;
int main(int argc, char** argv)
{
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
std::cout << GridLogMessage << "Grid is set up to use " << threads << " threads" << std::endl;
// Initialize spacetime grid
std::cout << GridLogMessage << "Lattice dimensions: " << grid_dim << " Ls: " << Ls << std::endl;
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(grid_dim,
GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
// Set up RNGs
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
// Random gauge field
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4, Umu);
FermionAction::ImplParams params;
FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c, params);
FermionAction Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, b, c, params);
// Construct the action and test the heatbath (zero initial guess)
{
OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, Npoles);
ConjugateGradient<FermionField> CG(1.0e-12, 5000);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);
Meofa.refresh(Umu, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
// Construct the action and test the heatbath (forecasted initial guesses)
{
OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, Npoles);
ConjugateGradient<FermionField> CG(1.0e-12, 5000);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);
Meofa.refresh(Umu, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
return 0;
}

Some files were not shown because too many files have changed in this diff.