Merge branch 'feature/Ls-vectorised-actions' into develop

2025-07-08 09:27:06 +01:00 · 2016-07-15 19:09:47 +01:00
parent fc4a043663 980ff18956
commit da34d75841
170 changed files with 3014 additions and 1310 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -82,13 +82,14 @@ install:
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
 script:
-    - ./scripts/reconfigure_script
+    - ./autogen.sh
    - mkdir build
    - cd build
    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - make -j1 -C prerequisites
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1
-    - make clean
+    - echo make clean
    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1
--- a/Makefile.am
+++ b/Makefile.am
@ -1,5 +1,6 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/
+AM_CXXFLAGS = -I$(top_srcdir)/include/
-SUBDIRS = lib tests benchmarks
+
 SUBDIRS = prerequisites lib benchmarks tests
 filelist: $(SUBDIRS)
--- a/scripts/reconfigure_script
+++ b/scripts/reconfigure_script
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
@ -196,5 +196,126 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
  for(int lat=4;lat<=32;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){
      std::vector<int> latt_size  ({lat,lat,lat,lat});
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      std::vector<CartesianCommunicator::CommsRequest_t> empty;
      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
      for(int mu=0;mu<4;mu++){
 	ncomm=0;
 	if (mpi_layout[mu]>1 ) {
 	  ncomm++;
 	  int comm_proc;
 	  int xmit_to_rank;
 	  int recv_from_rank;
 	  comm_proc=1;
 	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	  Grid.SendToRecvFromInit(requests_fwd[mu],
 				  (void *)&xbuf[mu][0],
 				  xmit_to_rank,
 				  (void *)&rbuf[mu][0],
 				  recv_from_rank,
 				  bytes);
 	  comm_proc = mpi_layout[mu]-1;
 	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	  Grid.SendToRecvFromInit(requests_bwd[mu],
 				  (void *)&xbuf[mu+4][0],
 				  xmit_to_rank,
 				  (void *)&rbuf[mu+4][0],
 				  recv_from_rank,
 				  bytes);
 	}
      }
      {
 	double start=usecond();
 	for(int i=0;i<Nloop;i++){
 	  for(int mu=0;mu<4;mu++){
 	    if (mpi_layout[mu]>1 ) {
 	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
 	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
 	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
 	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
 	    }
 	  }
 	  Grid.Barrier();
 	}
 	double stop=usecond();
 	double dbytes    = bytes;
 	double xbytes    = Nloop*dbytes*2.0*ncomm;
 	double rbytes    = xbytes;
 	double bidibytes = xbytes+rbytes;
 	double time = stop-start;
 	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
      }
      {
 	double start=usecond();
 	for(int i=0;i<Nloop;i++){
 	  for(int mu=0;mu<4;mu++){
 	    if (mpi_layout[mu]>1 ) {
 	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
 	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
 	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
 	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
 	    }
 	  }
 	  Grid.Barrier();
 	}
 	double stop=usecond();
 	double dbytes    = bytes;
 	double xbytes    = Nloop*dbytes*2.0*ncomm;
 	double rbytes    = xbytes;
 	double bidibytes = xbytes+rbytes;
 	double time = stop-start;
 	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
      }
    }
  }
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <PerfCount.h>
 using namespace std;
 using namespace Grid;
@ -46,9 +45,9 @@ struct scal {
  };
 bool overlapComms = false;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
 int main (int argc, char ** argv)
@ -71,8 +70,8 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
@ -87,6 +86,16 @@ int main (int argc, char ** argv)
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  /*  src=zero;
  std::vector<int> origin(5,0);
  SpinColourVector f=zero;
  for(int sp=0;sp<4;sp++){
  for(int co=0;co<3;co++){
    f()(sp)(co)=Complex(1.0,0.0); 
  }}
  pokeSite(f,src,origin);
  */
  ColourMatrix cm = Complex(1.0,0.0);
  LatticeGaugeField Umu(UGrid); 
@ -127,19 +136,16 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5  =1.8;
  typename DomainWallFermionR::ImplParams params; 
  params.overlapCommsCompute = overlapComms;
  RealD NP = UGrid->_Nprocessors;
  for(int doasm=1;doasm<2;doasm++){
    QCD::WilsonKernelsStatic::AsmOpt=doasm;
-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall =10;
+  int ncall =100;
  if (1) {
    double t0=usecond();
@ -165,11 +171,12 @@ int main (int argc, char ** argv)
  if (1)
  {
-    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+    typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
    LatticeFermion ssrc(sFGrid);
    LatticeFermion sref(sFGrid);
    LatticeFermion sresult(sFGrid);
-    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
+
    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
@ -181,7 +188,7 @@ int main (int argc, char ** argv)
      peekSite(tmp,src,site);
      pokeSite(tmp,ssrc,site);
    }}}}}
-
+    std::cout<<"src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
@ -208,6 +215,7 @@ int main (int argc, char ** argv)
      }
    }
    std::cout<<"res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
    RealF sum=0;
@ -221,9 +229,11 @@ int main (int argc, char ** argv)
      peekSite(normal,result,site);
      peekSite(simd,sresult,site);
      sum=sum+norm2(normal-simd);
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+      if (norm2(normal-simd) > 1.0e-6 ) {
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
 	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd   "<<simd<<std::endl;
      }
    }}}}}
    std::cout<<" difference between normal and simd is "<<sum<<std::endl;
@ -268,9 +278,9 @@ int main (int argc, char ** argv)
      pickCheckerboard(Even,ssrc_e,sresult);
      pickCheckerboard(Odd ,ssrc_o,sresult);
      ssrc_e = ssrc_e - sr_e;
-      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<<std::endl;
+      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
      ssrc_o = ssrc_o - sr_o;
-      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<<std::endl;
+      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
    }
--- a/benchmarks/Benchmark_dwf_ntpf.cc
+++ b/benchmarks/Benchmark_dwf_ntpf.cc
@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <PerfCount.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <PerfCount.h>
 using namespace std;
 using namespace Grid;
@ -126,7 +125,6 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  ColourMatrix cm = Complex(1.0,0.0);
  LatticeGaugeField Umu5d(FGrid); 
  // replicate across fifth dimension
@ -145,11 +143,10 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  }
 #ifdef CHECK
-  if (1)
+  if (1) {
-  {
+
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
@ -193,20 +190,19 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
    Counter.Report();
  }
-  if ( ! report ) 
+  if ( ! report ) {
-    {
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
-      double flops=1344*volume*ncall;
+    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
-      std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
+  }
    }
 #ifdef CHECK
-    err = ref-result; 
+  err = ref-result; 
-    RealD errd = norm2(err);
+  RealD errd = norm2(err);
-    if ( errd> 1.0e-4 ) {
+  if ( errd> 1.0e-4 ) {
-      std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
+    std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
-      exit(-1);
+    exit(-1);
-    }
+  }
 #endif
  LatticeFermion src_e (FrbGrid);
@ -232,10 +228,9 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
      std::cout<< flops/(t1-t0);
    }
  }
 }
-#undef CHECK_SDW
+#define CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
@ -243,7 +238,9 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
@ -277,93 +274,89 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
    }
  }
  RealD mass=0.1;
  RealD M5  =1.8;
-    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+  typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
-    LatticeFermion ssrc(sFGrid);
+  LatticeFermion ssrc(sFGrid);
-    LatticeFermion sref(sFGrid);
+  LatticeFermion sref(sFGrid);
-    LatticeFermion sresult(sFGrid);
+  LatticeFermion sresult(sFGrid);
-    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
+  WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
-    for(int x=0;x<latt4[0];x++){
+  for(int x=0;x<latt4[0];x++){
-    for(int y=0;y<latt4[1];y++){
+  for(int y=0;y<latt4[1];y++){
-    for(int z=0;z<latt4[2];z++){
+  for(int z=0;z<latt4[2];z++){
-    for(int t=0;t<latt4[3];t++){
+  for(int t=0;t<latt4[3];t++){
-    for(int s=0;s<Ls;s++){
+  for(int s=0;s<Ls;s++){
-      std::vector<int> site({s,x,y,z,t});
+    std::vector<int> site({s,x,y,z,t});
-      SpinColourVector tmp;
+    SpinColourVector tmp;
-      peekSite(tmp,src,site);
+    peekSite(tmp,src,site);
-      pokeSite(tmp,ssrc,site);
+    pokeSite(tmp,ssrc,site);
-    }}}}}
+  }}}}}
-    double t0=usecond();
+  double t0=usecond();
-    sDw.Dhop(ssrc,sresult,0);
+  sDw.Dhop(ssrc,sresult,0);
-    double t1=usecond();
+  double t1=usecond();
 #ifdef TIMERS_OFF
-    int ncall =10;
+  int ncall =10;
 #else 
-    int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
 #endif
-    PerformanceCounter Counter(8);
+  PerformanceCounter Counter(8);
-    Counter.Start();
+  Counter.Start();
-    t0=usecond();
+  t0=usecond();
-    for(int i=0;i<ncall;i++){
+  for(int i=0;i<ncall;i++){
-      sDw.Dhop(ssrc,sresult,0);
+    sDw.Dhop(ssrc,sresult,0);
-    }
+  }
-    t1=usecond();
+  t1=usecond();
-    Counter.Stop();
+  Counter.Stop();
-    if ( report ) {
+  if ( report ) {
-      Counter.Report();
+    Counter.Report();
-    } else { 
+  } else { 
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    std::cout<<"\t"<< flops/(t1-t0);
  }
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+  LatticeFermion sr_eo(sFGrid);
-      double flops=1344*volume*ncall;
+  LatticeFermion serr(sFGrid);
      std::cout<<"\t"<< flops/(t1-t0);
    }
  LatticeFermion ssrc_e (sFrbGrid);
  LatticeFermion ssrc_o (sFrbGrid);
  LatticeFermion sr_e   (sFrbGrid);
  LatticeFermion sr_o   (sFrbGrid);
-    LatticeFermion sr_eo(sFGrid);
+  pickCheckerboard(Even,ssrc_e,ssrc);
-    LatticeFermion serr(sFGrid);
+  pickCheckerboard(Odd,ssrc_o,ssrc);
-    LatticeFermion ssrc_e (sFrbGrid);
+  setCheckerboard(sr_eo,ssrc_o);
-    LatticeFermion ssrc_o (sFrbGrid);
+  setCheckerboard(sr_eo,ssrc_e);
    LatticeFermion sr_e   (sFrbGrid);
    LatticeFermion sr_o   (sFrbGrid);
-    pickCheckerboard(Even,ssrc_e,ssrc);
+  sr_e = zero;
-    pickCheckerboard(Odd,ssrc_o,ssrc);
+  sr_o = zero;
    setCheckerboard(sr_eo,ssrc_o);
    setCheckerboard(sr_eo,ssrc_e);
    sr_e = zero;
    sr_o = zero;
  sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
  PerformanceCounter CounterSdw(8);
  CounterSdw.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    __SSC_START;
    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
-    PerformanceCounter CounterSdw(8);
+    __SSC_STOP;
-    CounterSdw.Start();
+  }
-    t0=usecond();
+  t1=usecond();
-    for(int i=0;i<ncall;i++){
+  CounterSdw.Stop();
      __SSC_START;
      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
      __SSC_STOP;
    }
    t1=usecond();
    CounterSdw.Stop();
-    if ( report ) { 
+  if ( report ) { 
-      CounterSdw.Report();
+    CounterSdw.Report();
-    } else {
+  } else {
-
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(1344.0*volume*ncall)/2;
-      double flops=(1344.0*volume*ncall)/2;
+    std::cout<<"\t"<< flops/(t1-t0);
-      std::cout<<"\t"<< flops/(t1-t0);
+  }
    }
 }
--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@ -26,7 +26,7 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@ -25,8 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <PerfCount.h>
 using namespace Grid;
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/lib
+AM_CXXFLAGS = -I$(top_srcdir)/include
 AM_LDFLAGS = -L$(top_builddir)/lib
 #
--- a/configure.ac
+++ b/configure.ac
@ -10,6 +10,7 @@ AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
 AC_CANONICAL_SYSTEM
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
 AC_LINK_FILES(lib,include/Grid )
 AC_CONFIG_SRCDIR([lib/Grid.h])
 AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
@ -231,7 +232,6 @@ esac
 # Chroma regression tests
 #
 AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
 case ${ac_CHROMA} in
     yes)
       echo Enabling tests regressing to Chroma
@ -269,15 +269,15 @@ AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
-#echo
+echo
-#echo Checking doxygen support 
+echo Checking doxygen support 
-#echo :::::::::::::::::::::::::::::::::::::::::::
+echo :::::::::::::::::::::::::::::::::::::::::::
-#AC_PROG_DOXYGEN
+AC_PROG_DOXYGEN
-#if test -n "$DOXYGEN"
+if test -n "$DOXYGEN"
-#then
+then
-#AC_CONFIG_FILES([docs/doxy.cfg])
+AC_CONFIG_FILES([docs/doxy.cfg])
-#fi
+fi
 echo
 echo Creating configuration files
@ -285,8 +285,15 @@ echo :::::::::::::::::::::::::::::::::::::::::::
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
 AC_CONFIG_FILES(tests/IO/Makefile)
 AC_CONFIG_FILES(tests/core/Makefile)
 AC_CONFIG_FILES(tests/debug/Makefile)
 AC_CONFIG_FILES(tests/forces/Makefile)
 AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_CONFIG_FILES(prerequisites/Makefile)
 AC_OUTPUT
--- a/include/Grid
+++ b/include/Grid
@ -0,0 +1 @@
 ../lib
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@ -29,27 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H
-#include <algorithms/SparseMatrix.h>
+#include <Grid/algorithms/SparseMatrix.h>
-#include <algorithms/LinearOperator.h>
+#include <Grid/algorithms/LinearOperator.h>
-#include <algorithms/Preconditioner.h>
+#include <Grid/algorithms/Preconditioner.h>
-#include <algorithms/approx/Zolotarev.h>
+#include <Grid/algorithms/approx/Zolotarev.h>
-#include <algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/Chebyshev.h>
-#include <algorithms/approx/Remez.h>
+#include <Grid/algorithms/approx/Remez.h>
-#include <algorithms/approx/MultiShiftFunction.h>
+#include <Grid/algorithms/approx/MultiShiftFunction.h>
-#include <algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateGradient.h>
-#include <algorithms/iterative/ConjugateResidual.h>
+#include <Grid/algorithms/iterative/ConjugateResidual.h>
-#include <algorithms/iterative/NormalEquations.h>
+#include <Grid/algorithms/iterative/NormalEquations.h>
-#include <algorithms/iterative/SchurRedBlack.h>
+#include <Grid/algorithms/iterative/SchurRedBlack.h>
-#include <algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 // Lanczos support
-#include <algorithms/iterative/MatrixUtils.h>
+#include <Grid/algorithms/iterative/MatrixUtils.h>
-#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
-#include <algorithms/CoarsenedMatrix.h>
+#include <Grid/algorithms/CoarsenedMatrix.h>
 // Eigen/lanczos
 // EigCg
--- a/lib/Cartesian.h
+++ b/lib/Cartesian.h
@ -28,8 +28,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_H
 #define GRID_CARTESIAN_H
-#include <cartesian/Cartesian_base.h>
+#include <Grid/cartesian/Cartesian_base.h>
-#include <cartesian/Cartesian_full.h>
+#include <Grid/cartesian/Cartesian_full.h>
-#include <cartesian/Cartesian_red_black.h> 
+#include <Grid/cartesian/Cartesian_red_black.h> 
 #endif
--- a/lib/Communicator.h
+++ b/lib/Communicator.h
@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H
-#include <communicator/Communicator_base.h>
+#include <Grid/communicator/Communicator_base.h>
 #endif
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@ -28,17 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_H_
 #define _GRID_CSHIFT_H_
-#include <cshift/Cshift_common.h>
+#include <Grid/cshift/Cshift_common.h>
 #ifdef GRID_COMMS_NONE
-#include <cshift/Cshift_none.h>
+#include <Grid/cshift/Cshift_none.h>
 #endif
 #ifdef GRID_COMMS_MPI
-#include <cshift/Cshift_mpi.h>
+#include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_SHMEM
-#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
 #endif
--- a/lib/Eigen.inc
+++ b/lib/Eigen.inc
--- a/lib/Grid.h
+++ b/lib/Grid.h
@ -59,29 +59,29 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ///////////////////
 // Grid headers
 ///////////////////
-#include <serialisation/Serialisation.h>
+#include <Grid/serialisation/Serialisation.h>
-#include <Config.h>
+#include "Config.h"
-#include <Timer.h>
+#include <Grid/Timer.h>
-#include <PerfCount.h>
+#include <Grid/PerfCount.h>
-#include <Log.h>
+#include <Grid/Log.h>
-#include <AlignedAllocator.h>
+#include <Grid/AlignedAllocator.h>
-#include <Simd.h>
+#include <Grid/Simd.h>
-#include <Threads.h>
+#include <Grid/Threads.h>
-#include <Lexicographic.h>
+#include <Grid/Lexicographic.h>
-#include <Communicator.h> 
+#include <Grid/Communicator.h> 
-#include <Cartesian.h>    
+#include <Grid/Cartesian.h>    
-#include <Tensors.h>      
+#include <Grid/Tensors.h>      
-#include <Lattice.h>      
+#include <Grid/Lattice.h>      
-#include <Cshift.h>       
+#include <Grid/Cshift.h>       
-#include <Stencil.h>      
+#include <Grid/Stencil.h>      
-#include <Algorithms.h>   
+#include <Grid/Algorithms.h>   
-#include <parallelIO/BinaryIO.h>
+#include <Grid/parallelIO/BinaryIO.h>
-#include <qcd/QCD.h>
+#include <Grid/qcd/QCD.h>
-#include <parallelIO/NerscIO.h>
+#include <Grid/parallelIO/NerscIO.h>
-#include <Init.h>
+#include <Grid/Init.h>
-#include <qcd/hmc/NerscCheckpointer.h>
+#include <Grid/qcd/hmc/NerscCheckpointer.h>
-#include <qcd/hmc/HmcRunner.h>
+#include <Grid/qcd/hmc/HmcRunner.h>
--- a/lib/Lattice.h
+++ b/lib/Lattice.h
@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_LATTICE_H
 #define GRID_LATTICE_H
-#include <lattice/Lattice_base.h>
+#include <Grid/lattice/Lattice_base.h>
 #endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/
+AM_CXXFLAGS = -I$(top_srcdir)/include/
 extra_sources=
 if BUILD_COMMS_MPI
@ -17,16 +17,19 @@ endif
 #
 # Libraries
 #
 include Make.inc
 include Eigen.inc
 lib_LIBRARIES = libGrid.a
 libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
 fftwdir = $(prefix)/lib/
 fftw_DATA = libfftw3.a
 #	qcd/action/fermion/PartialFractionFermion5D.cc\	\
 #
 # Include files
 #
-nobase_include_HEADERS=$(HFILES)
+otherincludedir = $(includedir)/Grid
 nobase_otherinclude_HEADERS =$(HFILES) $(EFILES) fftw3.h Config.h
--- a/lib/Simd.h
+++ b/lib/Simd.h
@ -163,8 +163,8 @@ namespace Grid {
 };
-#include <simd/Grid_vector_types.h>
+#include "simd/Grid_vector_types.h"
-#include <simd/Grid_vector_unops.h>
+#include "simd/Grid_vector_unops.h"
 namespace Grid {
  // Default precision
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@ -30,7 +30,7 @@
 #include <thread>
- #include <stencil/Lebesgue.h>   // subdir aggregate
+ #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@ -30,22 +30,22 @@ Author: neo <cossu@post.kek.jp>
 #ifndef GRID_MATH_H
 #define GRID_MATH_H
-#include <tensors/Tensor_traits.h>
+#include <Grid/tensors/Tensor_traits.h>
-#include <tensors/Tensor_class.h>
+#include <Grid/tensors/Tensor_class.h>
-#include <tensors/Tensor_arith.h>
+#include <Grid/tensors/Tensor_arith.h>
-#include <tensors/Tensor_inner.h>
+#include <Grid/tensors/Tensor_inner.h>
-#include <tensors/Tensor_outer.h>
+#include <Grid/tensors/Tensor_outer.h>
-#include <tensors/Tensor_transpose.h>
+#include <Grid/tensors/Tensor_transpose.h>
-#include <tensors/Tensor_trace.h>
+#include <Grid/tensors/Tensor_trace.h>
-#include <tensors/Tensor_index.h>
+#include <Grid/tensors/Tensor_index.h>
-#include <tensors/Tensor_Ta.h>
+#include <Grid/tensors/Tensor_Ta.h>
-#include <tensors/Tensor_determinant.h>
+#include <Grid/tensors/Tensor_determinant.h>
-#include <tensors/Tensor_exp.h>
+#include <Grid/tensors/Tensor_exp.h>
-//#include <tensors/Tensor_peek.h>
+//#include <Grid/tensors/Tensor_peek.h>
-//#include <tensors/Tensor_poke.h>
+//#include <Grid/tensors/Tensor_poke.h>
-#include <tensors/Tensor_reality.h>
+#include <Grid/tensors/Tensor_reality.h>
-#include <tensors/Tensor_unary.h>
+#include <Grid/tensors/Tensor_unary.h>
-#include <tensors/Tensor_extract_merge.h>
+#include <Grid/tensors/Tensor_extract_merge.h>
-#include <tensors/Tensor_logical.h>
+#include <Grid/tensors/Tensor_logical.h>
 #endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H
 #include <Grid.h>
 namespace Grid {
--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
 #define  GRID_ALGORITHM_SPARSE_MATRIX_H
 #include <Grid.h>
 namespace Grid {
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CHEBYSHEV_H
 #define GRID_CHEBYSHEV_H
-#include<Grid.h>
+#include <Grid/algorithms/LinearOperator.h>
 #include<algorithms/LinearOperator.h>
 namespace Grid {
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@ -19,9 +19,9 @@
 #include <Config.h>
 #ifdef HAVE_GMP_H
-#include <algorithms/approx/bigfloat.h>
+#include "bigfloat.h"
 #else
-#include <algorithms/approx/bigfloat_double.h>
+#include "algorithms/approx/bigfloat_double.h"
 #endif
 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@ -130,8 +130,8 @@ DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st,
 }
-#include <algorithms/iterative/Householder.h>
+#include "Householder.h"
-#include <algorithms/iterative/Francis.h>
+#include "Francis.h"
 #endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -33,8 +33,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef USE_LAPACK
 #include <lapacke.h>
 #endif
-#include <algorithms/iterative/DenseMatrix.h>
+#include "DenseMatrix.h"
-#include <algorithms/iterative/EigenSort.h>
+#include "EigenSort.h"
 namespace Grid {
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_BASE_H
 #define GRID_CARTESIAN_BASE_H
 #include <Grid.h>
 namespace Grid{
@ -107,6 +106,12 @@ public:
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
    virtual int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
        return idx;
    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0; 
@ -123,12 +128,6 @@ public:
    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
    inline int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
        return idx;
    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
@ -220,7 +219,7 @@ public:
      }
      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
-      o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
+      o_idx= oIndex(lcoor);  // this implies divide by 2 on checkerdim
    }
    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@ -32,16 +32,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
-    static const int CbRed  =0;
+  static const int CbRed  =0;
-    static const int CbBlack=1;
+  static const int CbBlack=1;
-    static const int Even   =CbRed;
+  static const int Even   =CbRed;
-    static const int Odd    =CbBlack;
+  static const int Odd    =CbBlack;
    // Perhaps these are misplaced and 
    // should be in sparse matrix.
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
@ -227,6 +221,18 @@ protected:
        return idx;
    };
    virtual int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) {
 	  if( d==_checker_dim ) {
 	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
 	  } else { 
 	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
 	  }
 	}
        return idx;
    }
 };
 }
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@ -127,12 +127,21 @@ class CartesianCommunicator {
 			int recv_from_rank,
 			int bytes);
    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 			    void *xmit,
 			    int xmit_to_rank,
 			    void *recv,
 			    int recv_from_rank,
 			    int bytes);
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
    ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@ -144,6 +144,28 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }
 // Basic Halo comms primitive
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 					       void *xmit,
 					       int dest,
 					       void *recv,
 					       int from,
 					       int bytes)
 {
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor;
  int ierr;
  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,dest,_processor,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 {
  MPI_Startall(list.size(),&list[0]);
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -151,17 +173,12 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-  MPI_Request xrq;
+  std::vector<CommsRequest_t> reqs(0);
-  MPI_Request rrq;
+  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
-  int rank = _processor;
+  SendToRecvFromBegin(reqs);
-  int ierr;
+  for(int i=0;i<reqs.size();i++){
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+    list.push_back(reqs[i]);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+  }
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@ -84,6 +84,19 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
  assert(0);
 }
 void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  assert(0);
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 {
  assert(0);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@ -268,6 +268,10 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 {
  assert(0); //unimplemented
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -280,6 +284,15 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
  shmem_putmem(recv,xmit,bytes,dest);
 }
 void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  assert(0); // Unimplemented
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  //  shmem_quiet();      // I'm done
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@ -101,6 +101,7 @@ public:
    int begin(void) { return 0;};
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; };
    const vobj & operator[](int i) const { return _odata[i]; };
 public:
    typedef typename vobj::scalar_type scalar_type;
@ -324,27 +325,27 @@ PARALLEL_FOR_LOOP
-#include <lattice/Lattice_conformable.h>
+#include "Lattice_conformable.h"
 #define GRID_LATTICE_EXPRESSION_TEMPLATES
 #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
-#include <lattice/Lattice_ET.h>
+#include "Lattice_ET.h"
 #else 
-#include <lattice/Lattice_overload.h>
+#include "Lattice_overload.h"
 #endif
-#include <lattice/Lattice_arith.h>
+#include "Lattice_arith.h"
-#include <lattice/Lattice_trace.h>
+#include "Lattice_trace.h"
-#include <lattice/Lattice_transpose.h>
+#include "Lattice_transpose.h"
-#include <lattice/Lattice_local.h>
+#include "Lattice_local.h"
-#include <lattice/Lattice_reduction.h>
+#include "Lattice_reduction.h"
-#include <lattice/Lattice_peekpoke.h>
+#include "Lattice_peekpoke.h"
-#include <lattice/Lattice_reality.h>
+#include "Lattice_reality.h"
-#include <lattice/Lattice_comparison_utils.h>
+#include "Lattice_comparison_utils.h"
-#include <lattice/Lattice_comparison.h>
+#include "Lattice_comparison.h"
-#include <lattice/Lattice_coordinate.h>
+#include "Lattice_coordinate.h"
-#include <lattice/Lattice_where.h>
+#include "Lattice_where.h"
-#include <lattice/Lattice_rng.h>
+#include "Lattice_rng.h"
-#include <lattice/Lattice_unary.h>
+#include "Lattice_unary.h"
-#include <lattice/Lattice_transfer.h>
+#include "Lattice_transfer.h"
 #endif
--- a/lib/pugixml/pugixml.h
+++ b/lib/pugixml/pugixml.h
@ -17,7 +17,7 @@
 #endif
 // Include user configuration file (this can define various configuration macros)
-#include <pugixml/pugiconfig.hpp>
+#include "pugiconfig.hpp"
 #ifndef HEADER_PUGIXML_HPP
 #define HEADER_PUGIXML_HPP
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@ -60,6 +60,12 @@ namespace QCD {
    static const int SpinIndex   = 1;
    static const int LorentzIndex= 0;
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
    static const int InverseNo=0;
    static const int InverseYes=1;
    // Useful traits is this a spin index
    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
@ -484,16 +490,16 @@ namespace QCD {
 }   //namespace QCD
 } // Grid
-#include <qcd/utils/SpaceTimeGrid.h>
+#include <Grid/qcd/utils/SpaceTimeGrid.h>
-#include <qcd/spin/Dirac.h>
+#include <Grid/qcd/spin/Dirac.h>
-#include <qcd/spin/TwoSpinor.h>
+#include <Grid/qcd/spin/TwoSpinor.h>
-#include <qcd/utils/LinalgUtils.h>
+#include <Grid/qcd/utils/LinalgUtils.h>
-#include <qcd/utils/CovariantCshift.h>
+#include <Grid/qcd/utils/CovariantCshift.h>
-#include <qcd/utils/SUn.h>
+#include <Grid/qcd/utils/SUn.h>
-#include <qcd/action/Actions.h>
+#include <Grid/qcd/action/Actions.h>
-#include <qcd/hmc/integrators/Integrator.h>
+#include <Grid/qcd/hmc/integrators/Integrator.h>
-#include <qcd/hmc/integrators/Integrator_algorithm.h>
+#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
-#include <qcd/hmc/HMC.h>
+#include <Grid/qcd/hmc/HMC.h>
 #endif
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@ -40,25 +40,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Abstract base interface
 ////////////////////////////////////////////
-#include <qcd/action/ActionBase.h>
+#include <Grid/qcd/action/ActionBase.h>
-#include <qcd/action/ActionParams.h>
+#include <Grid/qcd/action/ActionParams.h>
 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
-#include <qcd/action/gauge/GaugeImpl.h>
+#include <Grid/qcd/action/gauge/GaugeImpl.h>
-#include <qcd/utils/WilsonLoops.h>
+#include <Grid/qcd/utils/WilsonLoops.h>
-#include <qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
-#include <qcd/action/fermion/FermionOperatorImpl.h>
+#include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
-#include <qcd/action/fermion/FermionOperator.h>
+#include <Grid/qcd/action/fermion/FermionOperator.h>
-#include <qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
-#include <qcd/action/gauge/WilsonGaugeAction.h>
+#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
-#include <qcd/action/gauge/PlaqPlusRectangleAction.h>
+#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>
 namespace Grid {
 namespace QCD {
@ -107,41 +107,50 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 // for EVERY .cc file. This define centralises the list and restores global push of impl cases
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-#define FermOpTemplateInstantiate(A) \
+
 #define FermOp4dVecTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		
 #define FermOp5dVecTemplateInstantiate(A) \
  template class A<DomainWallVec5dImplF>;	\
  template class A<DomainWallVec5dImplD>;	
 #define FermOpTemplateInstantiate(A) \
 FermOp4dVecTemplateInstantiate(A) \
 FermOp5dVecTemplateInstantiate(A) 
 #define GparityFermOpTemplateInstantiate(A) 
 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
-#include <qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
-#include <qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
-//#include <qcd/action/fermion/CloverFermion.h>
+//#include <Grid/qcd/action/fermion/CloverFermion.h>
-#include <qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
-#include <qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
-#include <qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
-#include <qcd/action/fermion/MobiusFermion.h>
+#include <Grid/qcd/action/fermion/MobiusFermion.h>
-#include <qcd/action/fermion/ScaledShamirFermion.h>
+#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
-#include <qcd/action/fermion/MobiusZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
-#include <qcd/action/fermion/ShamirZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
-#include <qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
-#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
-#include <qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
-#include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
@ -222,21 +231,21 @@ typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
-#include <qcd/action/fermion/g5HermitianLinop.h>
+#include <Grid/qcd/action/fermion/g5HermitianLinop.h>
 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
-#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
+#include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
-#include <qcd/action/pseudofermion/TwoFlavour.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
-#include <qcd/action/pseudofermion/TwoFlavourRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
-#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
-#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
-#include <qcd/action/pseudofermion/OneFlavourRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
-#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
-#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
-#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
@ -45,487 +48,343 @@ namespace QCD {
 		   FourDimGrid,
 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
- {
+ { }
 }
- template<class Impl>
+template<class Impl>  
-  void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
+void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
-  {
+{
-    // Assemble Din
+  int Ls=this->Ls;
-    int Ls=this->Ls;
+  std::vector<RealD> diag (Ls,1.0);
-    for(int s=0;s<Ls;s++){
+  std::vector<RealD> upper(Ls,-1.0); upper[Ls-1]=mass;
-      if ( s==0 ) {
+  std::vector<RealD> lower(Ls,-1.0); lower[0]   =mass;
-	//	Din = bs psi[s] + cs[s] psi[s+1}
+  M5D(psi,chi,chi,lower,diag,upper);
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
+}
-	//      Din+= -mass*cs[s] psi[s+1}
+template<class Impl>
-	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
+void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
-      } else if ( s==(Ls-1)) { 
+{
-	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
+  int Ls=this->Ls;
-	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
+  std::vector<RealD> diag = bs;
-      } else {
+  std::vector<RealD> upper= cs;
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
+  std::vector<RealD> lower= cs; 
-	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
+  upper[Ls-1]=-mass*upper[Ls-1];
-      }
+  lower[0]   =-mass*lower[0];
-    }
+  M5D(psi,psi,Din,lower,diag,upper);
 }
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  std::vector<RealD> diag = beo;
  std::vector<RealD> upper(Ls);
  std::vector<RealD> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
- template<class Impl>
+  upper[Ls-1]=-mass*upper[Ls-1];
-  void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
+  lower[0]   =-mass*lower[0];
-  {
+  M5D(psi,psi,chi,lower,diag,upper);
-    int Ls=this->Ls;
+}
-    for(int s=0;s<Ls;s++){
+template<class Impl>
-      if ( s==0 ) {
+void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
+{
-	axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1);
+  int Ls=this->Ls;
-      } else if ( s==(Ls-1)) { 
+  std::vector<RealD> diag = bee;
-	axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);
+  std::vector<RealD> upper(Ls);
-	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
+  std::vector<RealD> lower(Ls);
-      } else {
+  for(int i=0;i<Ls;i++) {
-	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
+    upper[i]=-cee[i];
-	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
+    lower[i]=-cee[i];
      }
    }
  }
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
-  // override multiply
+template<class Impl>
- template<class Impl>
+void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-  RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+{
-  {
+  int Ls=this->Ls;
-    int Ls=this->Ls;
+  std::vector<RealD> diag = bee;
  std::vector<RealD> upper(Ls);
  std::vector<RealD> lower(Ls);
-    FermionField Din(psi._grid);
+  for (int s=0;s<Ls;s++){
    // Assemble Din
    /*
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	Din = bs psi[s] + cs[s] psi[s+1}
 	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
 	//      Din+= -mass*cs[s] psi[s+1}
 	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
      } else if ( s==(Ls-1)) { 
 	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
 	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
      } else {
 	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
 	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
      }
    }
    */
    Meooe5D(psi,Din);
    this->DW(Din,chi,DaggerNo);
    // ((b D_W + D_w hop terms +1) on s-diag
    axpby(chi,1.0,1.0,chi,psi); 
    // Call Mooee??
    for(int s=0;s<Ls;s++){
      if ( s==0 ){
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
 	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,Ls-1);
      } else if ( s==(Ls-1)) {
 	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,0);
 	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
      } else {
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
 	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
      }
    }
    return norm2(chi);
  }
 template<class Impl>
  RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
  {
    // Under adjoint
    //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
    //D2- P+     D2+            P-D1-^dag D2+dag
    FermionField Din(psi._grid);
    // Apply Dw
    this->DW(psi,Din,DaggerYes); 
    MeooeDag5D(Din,chi);
    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      // Collect the terms in DW
      //	Chi = bs Din[s] + cs[s] Din[s+1}
      //    Chi+= -mass*cs[s] psi[s+1}
      /*
      if ( s==0 ) {
 	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
      } else if ( s==(Ls-1)) { 
 	axpby_ssp_pplus (chi,bs[s],Din,-mass*cs[0],Din,s,0);
 	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
      } else {
 	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
      }
      */
      // FIXME just call MooeeDag??
      // Collect the terms indept of DW
      if ( s==0 ){
 	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,Ls-1);
      } else if ( s==(Ls-1)) {
 	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,0);
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
      } else {
 	axpby_ssp_pplus(chi,1.0,chi,-1.0,psi,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
      }
    }
    // ((b D_W + D_w hop terms +1) on s-diag
    axpby (chi,1.0,1.0,chi,psi); 
    return norm2(chi);
  }
  // half checkerboard operations
 template<class Impl>
  void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    FermionField tmp(psi._grid);
    // Assemble the 5d matrix
-    Meooe5D(psi,tmp); 
+    if ( s==0 ) {
-#if 0
+      upper[s] = -cee[s+1] ;
-    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
+      lower[s] = mass*cee[Ls-1];
-    for(int s=0;s<Ls;s++){
+    } else if ( s==(Ls-1)) { 
-      if ( s==0 ) {
+      upper[s] = mass*cee[0];
-	//	tmp = bs psi[s] + cs[s] psi[s+1}
+      lower[s] = -cee[s-1];
 	//      tmp+= -mass*cs[s] psi[s+1}
 	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
 	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
      } else if ( s==(Ls-1)) { 
 	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
 	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
      } else {
 	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
 	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
      }
    }
    std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
 #endif
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
      this->DhopEO(tmp,chi,DaggerNo);
    } else {
-      this->DhopOE(tmp,chi,DaggerNo);
+      upper[s]=-cee[s+1];
      lower[s]=-cee[s-1];
    }
  }
-  template<class Impl>
+  M5Ddag(psi,psi,chi,lower,diag,upper);
-  void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+}
  {
    FermionField tmp(psi._grid);
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
      this->DhopEO(psi,tmp,DaggerYes);
    } else {
      this->DhopOE(psi,tmp,DaggerYes);
    }
-    MeooeDag5D(tmp,chi); 
+template<class Impl>
-#if 0
+void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
-    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
+{
-    // Assemble the 5d matrix
+  int Ls=this->Ls;
-    int Ls=this->Ls;
+  std::vector<RealD> diag(Ls,1.0);
-    for(int s=0;s<Ls;s++){
+  std::vector<RealD> upper(Ls,-1.0);
-      if ( s==0 ) {
+  std::vector<RealD> lower(Ls,-1.0);
-	axpby_ssp_pplus(chi,beo[s],tmp,   -ceo[s+1]  ,tmp,s,s+1);
+  upper[Ls-1]=-mass*upper[Ls-1];
-	axpby_ssp_pminus(chi,   1.0,chi,mass*ceo[Ls-1],tmp,s,Ls-1);
+  lower[0]   =-mass*lower[0];
-      } else if ( s==(Ls-1)) { 
+  M5Ddag(psi,chi,chi,lower,diag,upper);
-	axpby_ssp_pplus(chi,beo[s],tmp,mass*ceo[0],tmp,s,0);
+}
 	axpby_ssp_pminus(chi,1.0,chi,-ceo[s-1],tmp,s,s-1);
      } else {
 	axpby_ssp_pplus(chi,beo[s],tmp,-ceo[s+1],tmp,s,s+1);
 	axpby_ssp_pminus(chi,1.0   ,chi,-ceo[s-1],tmp,s,s-1);
      }
    }
    std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
 #endif
 template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
  std::vector<RealD> diag =bs;
  std::vector<RealD> upper=cs;
  std::vector<RealD> lower=cs;
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,psi,Din,lower,diag,upper);
 }
 template<class Impl>
 RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  FermionField Din(psi._grid);
  // Assemble Din
  Meooe5D(psi,Din);
  this->DW(Din,chi,DaggerNo);
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
  M5D(psi,chi);
  return(norm2(chi));
 }
 template<class Impl>
 RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
 {
  // Under adjoint
  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
  //D2- P+     D2+            P-D1-^dag D2+dag
  FermionField Din(psi._grid);
  // Apply Dw
  this->DW(psi,Din,DaggerYes); 
  MeooeDag5D(Din,chi);
  M5Ddag(psi,chi);
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby (chi,1.0,1.0,chi,psi); 
  return norm2(chi);
 }
 // half checkerboard operations
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  FermionField tmp(psi._grid);
  Meooe5D(psi,tmp); 
  if ( psi.checkerboard == Odd ) {
    this->DhopEO(tmp,chi,DaggerNo);
  } else {
    this->DhopOE(tmp,chi,DaggerNo);
  }
 }
- template<class Impl>
+template<class Impl>
-  void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-  {
+{
-    int Ls=this->Ls;
+  FermionField tmp(psi._grid);
-    for (int s=0;s<Ls;s++){
+  // Apply 4d dslash
-      if ( s==0 ) {
+  if ( psi.checkerboard == Odd ) {
-	axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
+    this->DhopEO(psi,tmp,DaggerYes);
-	axpby_ssp_pplus (chi,1.0,chi,mass*cee[s],psi,s,Ls-1);
+  } else {
-      } else if ( s==(Ls-1)) { 
+    this->DhopOE(psi,tmp,DaggerYes);
 	axpby_ssp_pminus(chi,bee[s],psi,mass*cee[s],psi,s,0);
 	axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
      } else {
 	axpby_ssp_pminus(chi,bee[s],psi,-cee[s],psi,s,s+1);
 	axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
      }
    }
  }
  MeooeDag5D(tmp,chi); 
 }
- template<class Impl>
+template<class Impl>
-  void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-    int Ls=this->Ls;
+  FermionField tmp(psi._grid);
-    FermionField tmp(psi._grid);
+  Meo5D(psi,tmp);
-    // Assemble the 5d matrix
+  // Apply 4d dslash fragment
-    for(int s=0;s<Ls;s++){
+  this->DhopDir(tmp,chi,dir,disp);
-      if ( s==0 ) {
+}
-	//	tmp = bs psi[s] + cs[s] psi[s+1}
+// force terms; five routines; default to Dhop on diagonal
-	//      tmp+= -mass*cs[s] psi[s+1}
+template<class Impl>
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
+void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
+{
-      } else if ( s==(Ls-1)) { 
+  FermionField Din(V._grid);
-	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
+  
-	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
+  if ( dag == DaggerNo ) {
-      } else {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
+    Meooe5D(V,Din);
-	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
+    this->DhopDeriv(mat,U,Din,dag);
-      }
+  } else {
-    }
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-    // Apply 4d dslash fragment
+    Meooe5D(U,Din);
-    this->DhopDir(tmp,chi,dir,disp);
+    this->DhopDeriv(mat,Din,V,dag);
  }
 };
 template<class Impl>
 void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
 {
  FermionField Din(V._grid);
- template<class Impl>
+  if ( dag == DaggerNo ) {
-  void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+    //      U d/du [D_w D5] V = U d/du DW D5 V
-  {
+    Meooe5D(V,Din);
-    int Ls=this->Ls;
+    this->DhopDerivOE(mat,U,Din,dag);
-    for (int s=0;s<Ls;s++){
+  } else {
-      // Assemble the 5d matrix
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
      if ( s==0 ) {
 	axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1]  ,psi,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,mass*cee[Ls-1],psi,s,Ls-1);
      } else if ( s==(Ls-1)) { 
 	axpby_ssp_pplus(chi,bee[s],psi,mass*cee[0],psi,s,0);
 	axpby_ssp_pminus(chi,1.0,chi,-cee[s-1],psi,s,s-1);
      } else {
 	axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1],psi,s,s+1);
 	axpby_ssp_pminus(chi,1.0   ,chi,-cee[s-1],psi,s,s-1);
      }
    }
  }
 template<class Impl>
  void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    // Apply (L^{\prime})^{-1}
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
      axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
    }
    // L_m^{-1} 
    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
      axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
    }
    // U_m^{-1} D^{-1}
    for (int s=0;s<Ls-1;s++){
      // Chi[s] + 1/d chi[s] 
      axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
    }	
    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
    // Apply U^{-1}
    for (int s=Ls-2;s>=0;s--){
      axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls]
    }
  }
 template<class Impl>
  void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    // Apply (U^{\prime})^{-dagger}
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
      axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
    }
    // U_m^{-\dagger} 
    for (int s=0;s<Ls-1;s++){
      axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
    }
    // L_m^{-\dagger} D^{-dagger}
    for (int s=0;s<Ls-1;s++){
      axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
    }	
    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
    // Apply L^{-dagger}
    for (int s=Ls-2;s>=0;s--){
      axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
    }
  }
  // force terms; five routines; default to Dhop on diagonal
  template<class Impl>
  void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
  {
    FermionField Din(V._grid);
    if ( dag == DaggerNo ) {
      //      U d/du [D_w D5] V = U d/du DW D5 V
      Meooe5D(V,Din);
      this->DhopDeriv(mat,U,Din,dag);
    } else {
      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
      Meooe5D(U,Din);
      this->DhopDeriv(mat,Din,V,dag);
    }
  };
 template<class Impl>
  void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
  {
    FermionField Din(V._grid);
    if ( dag == DaggerNo ) {
      //      U d/du [D_w D5] V = U d/du DW D5 V
      Meooe5D(V,Din);
      this->DhopDerivOE(mat,U,Din,dag);
    } else {
      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
      Meooe5D(U,Din);
      this->DhopDerivOE(mat,Din,V,dag);
-    }
+  }
-  };
+};
- template<class Impl>
+template<class Impl>
-  void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
+{
-    FermionField Din(V._grid);
+  FermionField Din(V._grid);
  if ( dag == DaggerNo ) {
    //      U d/du [D_w D5] V = U d/du DW D5 V
    Meooe5D(V,Din);
    this->DhopDerivEO(mat,U,Din,dag);
  } else {
    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
    Meooe5D(U,Din);
    this->DhopDerivEO(mat,Din,V,dag);
  }
 };
 // Tanh
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
  SetCoefficientsZolotarev(1.0,zdata,b,c);
 }
 //Zolo
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
  int Ls=this->Ls;
  ///////////////////////////////////////////////////////////
  // The Cayley coeffs (unprec)
  ///////////////////////////////////////////////////////////
  omega.resize(Ls);
  bs.resize(Ls);
  cs.resize(Ls);
  as.resize(Ls);
  // 
  // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
  //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
  //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
  //
  //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
  //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
  //
  // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
  // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
  //
  // So 
  //
  // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
  //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
  //
  // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
  //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
  // 
  double bpc = b+c;
  double bmc = b-c;
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
  }
  ////////////////////////////////////////////////////////
  // Constants for the preconditioned matrix Cayley form
  ////////////////////////////////////////////////////////
  bee.resize(Ls);
  cee.resize(Ls);
  beo.resize(Ls);
  ceo.resize(Ls);
  for(int i=0;i<Ls;i++){
    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
    beo[i]=as[i]*bs[i];
    ceo[i]=-as[i]*cs[i];
  }
  aee.resize(Ls);
  aeo.resize(Ls);
  for(int i=0;i<Ls;i++){
    aee[i]=cee[i];
    aeo[i]=ceo[i];
  }
  //////////////////////////////////////////
  // LDU decomposition of eeoo
  //////////////////////////////////////////
  dee.resize(Ls);
  lee.resize(Ls);
  leem.resize(Ls);
  uee.resize(Ls);
  ueem.resize(Ls);
  for(int i=0;i<Ls;i++){
    dee[i] = bee[i];
    if ( i < Ls-1 ) {
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
      leem[i]=mass*cee[Ls-1]/bee[0];
      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
      ueem[i]=mass;
      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
      ueem[i]*= aee[0]/bee[0];
    if ( dag == DaggerNo ) {
      //      U d/du [D_w D5] V = U d/du DW D5 V
      Meooe5D(V,Din);
      this->DhopDerivEO(mat,U,Din,dag);
    } else { 
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+      lee[i] =0.0;
-      Meooe5D(U,Din);
+      leem[i]=0.0;
-      this->DhopDerivEO(mat,Din,V,dag);
+      uee[i] =0.0;
-    }
+      ueem[i]=0.0;
  };
  // Tanh
 template<class Impl>
  void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
  {
    SetCoefficientsZolotarev(1.0,zdata,b,c);
  }
  //Zolo
 template<class Impl>
  void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
  {
    int Ls=this->Ls;
    ///////////////////////////////////////////////////////////
    // The Cayley coeffs (unprec)
    ///////////////////////////////////////////////////////////
    omega.resize(Ls);
    bs.resize(Ls);
    cs.resize(Ls);
    as.resize(Ls);
    // 
    // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
    //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
    //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
    //
    //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
    //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
    //
    // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
    // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
    //
    // So 
    //
    // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
    //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
    //
    // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
    //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
    // 
    double bpc = b+c;
    double bmc = b-c;
    for(int i=0; i < Ls; i++){
      as[i] = 1.0;
      omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
      bs[i] = 0.5*(bpc/omega[i] + bmc);
      cs[i] = 0.5*(bpc/omega[i] - bmc);
    }
    ////////////////////////////////////////////////////////
    // Constants for the preconditioned matrix Cayley form
    ////////////////////////////////////////////////////////
    bee.resize(Ls);
    cee.resize(Ls);
    beo.resize(Ls);
    ceo.resize(Ls);
    for(int i=0;i<Ls;i++){
      bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
      cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
      beo[i]=as[i]*bs[i];
      ceo[i]=-as[i]*cs[i];
    }
    aee.resize(Ls);
    aeo.resize(Ls);
    for(int i=0;i<Ls;i++){
      aee[i]=cee[i];
      aeo[i]=ceo[i];
    }
    //////////////////////////////////////////
    // LDU decomposition of eeoo
    //////////////////////////////////////////
    dee.resize(Ls);
    lee.resize(Ls);
    leem.resize(Ls);
    uee.resize(Ls);
    ueem.resize(Ls);
    for(int i=0;i<Ls;i++){
      dee[i] = bee[i];
      if ( i < Ls-1 ) {
 	lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
 	leem[i]=mass*cee[Ls-1]/bee[0];
 	for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
 	uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
 	ueem[i]=mass;
 	for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
 	ueem[i]*= aee[0]/bee[0];
      } else { 
 	lee[i] =0.0;
 	leem[i]=0.0;
 	uee[i] =0.0;
 	ueem[i]=0.0;
      }
    }
    { 
      double delta_d=mass*cee[Ls-1];
      for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
      dee[Ls-1] += delta_d;
    }
  }
  { 
    double delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
    dee[Ls-1] += delta_d;
  }  
 }
  FermOpTemplateInstantiate(CayleyFermion5D);
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@ -51,6 +51,29 @@ namespace Grid {
      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
      virtual void   Meo5D (const FermionField &psi, FermionField &chi);
      virtual void   M5D   (const FermionField &psi, FermionField &chi);
      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
      /////////////////////////////////////////////////////
      // Instantiate different versions depending on Impl
      /////////////////////////////////////////////////////
      void M5D(const FermionField &psi,
 	       const FermionField &phi, 
 	       FermionField &chi,
 	       std::vector<RealD> &lower,
 	       std::vector<RealD> &diag,
 	       std::vector<RealD> &upper);
      void M5Ddag(const FermionField &psi,
 		  const FermionField &phi, 
 		  FermionField &chi,
 		  std::vector<RealD> &lower,
 		  std::vector<RealD> &diag,
 		  std::vector<RealD> &upper);
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
      virtual void   Instantiatable(void)=0;
      // force terms; five routines; default to Dhop on diagonal
@ -94,6 +117,8 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
@ -101,5 +126,15 @@ namespace Grid {
  }
 }
 #define INSTANTIATE_DPERP(A)\
 template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
 					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
 template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
 					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
 template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
 template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
 #define CAYLEY_DPERP_CACHE
 #undef  CAYLEY_DPERP_LINALG
 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@ -0,0 +1,209 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
  // Pminus fowards
  // Pplus  backwards..
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
 				std::vector<RealD> &lower,
 				std::vector<RealD> &diag,
 				std::vector<RealD> &upper)
 {
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    for(int s=0;s<Ls;s++){
      auto tmp = psi._odata[0];
      if ( s==0 ) {
 	                            spProj5m(tmp,psi._odata[ss+s+1]);
 	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
 	                    spProj5p(tmp,psi._odata[ss+Ls-1]);
 	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
      } else if ( s==(Ls-1)) {
 	                            spProj5m(tmp,psi._odata[ss+0]);
 	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
 	                    spProj5p(tmp,psi._odata[ss+s-1]);
 	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
      } else { 
 	                            spProj5m(tmp,psi._odata[ss+s+1]);
 	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
 	                    spProj5p(tmp,psi._odata[ss+s-1]);
 	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
      }
    }
  }
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
 				   std::vector<RealD> &lower,
 				   std::vector<RealD> &diag,
 				   std::vector<RealD> &upper)
 {
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	spProj5p(tmp,psi._odata[ss+s+1]);
 	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
 	spProj5m(tmp,psi._odata[ss+Ls-1]);
 	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
      } else if ( s==(Ls-1)) {
 	spProj5p(tmp,psi._odata[ss+0]);
 	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
 	spProj5m(tmp,psi._odata[ss+s-1]);
 	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
      } else { 
 	spProj5p(tmp,psi._odata[ss+s+1]);
 	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
 	spProj5m(tmp,psi._odata[ss+s-1]);
 	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
      }
    }
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
 {
  GridBase *grid=psi._grid;
  int Ls=this->Ls;
  chi.checkerboard=psi.checkerboard;
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
    // Apply (L^{\prime})^{-1}
    chi[ss]=psi[ss]; // chi[0]=psi[0]
    for(int s=1;s<Ls;s++){
                            spProj5p(tmp,chi[ss+s-1]);  
      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
    }
    // L_m^{-1} 
    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
                                   spProj5m(tmp,chi[ss+s]);    
      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
    }
    // U_m^{-1} D^{-1}
    for (int s=0;s<Ls-1;s++){
      // Chi[s] + 1/d chi[s] 
                                                spProj5p(tmp,chi[ss+Ls-1]); 
      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
    }	
    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
    // Apply U^{-1}
    for (int s=Ls-2;s>=0;s--){
                            spProj5m(tmp,chi[ss+s+1]);  
      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
    }
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
  GridBase *grid=psi._grid;
  int Ls=this->Ls;
  assert(psi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
    // Apply (U^{\prime})^{-dagger}
    chi[ss]=psi[ss];
    for (int s=1;s<Ls;s++){
                            spProj5m(tmp,chi[ss+s-1]);
      chi[ss+s] = psi[ss+s]-uee[s-1]*tmp;
    }
    // U_m^{-\dagger} 
    for (int s=0;s<Ls-1;s++){
                                   spProj5p(tmp,chi[ss+s]);
      chi[ss+Ls-1] = chi[ss+Ls-1] - ueem[s]*tmp;
    }
    // L_m^{-\dagger} D^{-dagger}
    for (int s=0;s<Ls-1;s++){
      spProj5m(tmp,chi[ss+Ls-1]);
      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(leem[s]/dee[Ls-1])*tmp;
    }	
    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
    // Apply L^{-dagger}
    for (int s=Ls-2;s>=0;s--){
      spProj5p(tmp,chi[ss+s+1]);
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
    }
  }
 }
 #ifdef CAYLEY_DPERP_CACHE
  INSTANTIATE_DPERP(WilsonImplF);
  INSTANTIATE_DPERP(WilsonImplD);
  INSTANTIATE_DPERP(GparityWilsonImplF);
  INSTANTIATE_DPERP(GparityWilsonImplD);
 #endif
 }}
--- a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
@ -0,0 +1,133 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Eigen/Dense>
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
  /*
   * Dense matrix versions of routines
   */
  /*
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
 {
  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
 }
  */
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
  int Ls=this->Ls;
  int LLs = psi._grid->_rdimensions[0];
  int vol = psi._grid->oSites()/LLs;
  chi.checkerboard=psi.checkerboard;
  assert(Ls==LLs);
  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
    Pminus(s,s)= bee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pminus(s,s+1) = -cee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pplus(s+1,s) = -cee[s+1];
  }
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  Eigen::MatrixXd PplusMat ;
  Eigen::MatrixXd PminusMat;
  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  } else { 
    PplusMat =Pplus;
    PminusMat=Pminus;
  }
  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }
  // For the non-vectorised s-direction this is simple
  for(auto site=0;site<vol;site++){
    SiteSpinor     SiteChi;
    SiteHalfSpinor SitePplus;
    SiteHalfSpinor SitePminus;
    for(int s1=0;s1<Ls;s1++){
      SiteChi =zero;
      for(int s2=0;s2<Ls;s2++){
 	int lex2 = s2+Ls*site;
 	if ( PplusMat(s1,s2) != 0.0 ) {
 	  spProj5p(SitePplus,psi[lex2]);
 	  accumRecon5p(SiteChi,PplusMat (s1,s2)*SitePplus);
 	}
 	if ( PminusMat(s1,s2) != 0.0 ) {
 	  spProj5m(SitePminus,psi[lex2]);
 	  accumRecon5m(SiteChi,PminusMat(s1,s2)*SitePminus);
 	}
      }
      chi[s1+Ls*site] = SiteChi*0.5;
    }
  }
 }
 template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 }}
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@ -0,0 +1,149 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
  // Pminus fowards
  // Pplus  backwards
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
 				std::vector<RealD> &lower,
 				std::vector<RealD> &diag,
 				std::vector<RealD> &upper)
 {
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
    if ( s==0 ) {
      axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1);
      axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,Ls-1);
    } else if ( s==(Ls-1)) { 
      axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,0);
      axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,s-1);
    } else {
      axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1);
      axpby_ssp_pplus(chi,1.0,chi,lower[s],psi,s,s-1);
    }
  }
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
 				   std::vector<RealD> &lower,
 				   std::vector<RealD> &diag,
 				   std::vector<RealD> &upper)
 {
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
    if ( s==0 ) {
      axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1);
      axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,Ls-1);
    } else if ( s==(Ls-1)) { 
      axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,0);
      axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s-1);
    } else {
      axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1);
      axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s-1);
    }
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
 {
  chi.checkerboard=psi.checkerboard;
  int Ls=this->Ls;
  // Apply (L^{\prime})^{-1}
  axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
  for (int s=1;s<Ls;s++){
    axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
  }
  // L_m^{-1} 
  for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
    axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
  }
  // U_m^{-1} D^{-1}
  for (int s=0;s<Ls-1;s++){
    // Chi[s] + 1/d chi[s] 
    axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
  }	
  axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
  // Apply U^{-1}
  for (int s=Ls-2;s>=0;s--){
    axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls]
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
  chi.checkerboard=psi.checkerboard;
  int Ls=this->Ls;
  // Apply (U^{\prime})^{-dagger}
  axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
  for (int s=1;s<Ls;s++){
    axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
  }
  // U_m^{-\dagger} 
  for (int s=0;s<Ls-1;s++){
    axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
  }
  // L_m^{-\dagger} D^{-dagger}
  for (int s=0;s<Ls-1;s++){
    axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
  }	
  axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
  // Apply L^{-dagger}
  for (int s=Ls-2;s>=0;s--){
    axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
  }
 }
 #ifdef CAYLEY_DPERP_LINALG
  INSTANTIATE(WilsonImplF);
  INSTANTIATE(WilsonImplD);
  INSTANTIATE(GparityWilsonImplF);
  INSTANTIATE(GparityWilsonImplD);
 #endif
 }
 }
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@ -0,0 +1,305 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Eigen/Dense>
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
  /*
   * Dense matrix versions of routines
   */
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
 {
  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
 				std::vector<RealD> &lower,
 				std::vector<RealD> &diag,
 				std::vector<RealD> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
  int nsimd= Simd::Nsimd();
  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
  Vector<iSinglet<Simd> > d(LLs);
  assert(Ls/LLs==nsimd);
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  // just directly address via type pun
  typedef typename Simd::scalar_type scalar_type;
  scalar_type * u_p = (scalar_type *)&u[0];
  scalar_type * l_p = (scalar_type *)&l[0];
  scalar_type * d_p = (scalar_type *)&d[0];
  for(int o=0;o<LLs;o++){ // outer
  for(int i=0;i<nsimd;i++){ //inner
    int s  = o+i*LLs;
    int ss = o*nsimd+i;
    u_p[ss] = upper[s];
    l_p[ss] = lower[s];
    d_p[ss] = diag[s];
  }}
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
    alignas(64) SiteSpinor fm;
    for(int v=0;v<LLs;v++){
      int vp=(v+1)%LLs;
      int vm=(v+LLs-1)%LLs;
      spProj5m(hp,psi[ss+vp]);
      spProj5p(hm,psi[ss+vm]);
      if ( vp<=v ) rotate(hp,hp,1);
      if ( vm>=v ) rotate(hm,hm,nsimd-1);
      hp=hp*0.5;
      hm=hm*0.5;
      spRecon5m(fp,hp);
      spRecon5p(fm,hm);
      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
      chi[ss+v] = chi[ss+v]     +l[v]*fm;
    }
  }
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
 				   std::vector<RealD> &lower,
 				   std::vector<RealD> &diag,
 				   std::vector<RealD> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
  int nsimd= Simd::Nsimd();
  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
  Vector<iSinglet<Simd> > d(LLs);
  assert(Ls/LLs==nsimd);
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  // just directly address via type pun
  typedef typename Simd::scalar_type scalar_type;
  scalar_type * u_p = (scalar_type *)&u[0];
  scalar_type * l_p = (scalar_type *)&l[0];
  scalar_type * d_p = (scalar_type *)&d[0];
  for(int o=0;o<LLs;o++){ // outer
  for(int i=0;i<nsimd;i++){ //inner
    int s  = o+i*LLs;
    int ss = o*nsimd+i;
    u_p[ss] = upper[s];
    l_p[ss] = lower[s];
    d_p[ss] = diag[s];
  }}
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
    alignas(64) SiteSpinor fm;
    for(int v=0;v<LLs;v++){
      int vp=(v+1)%LLs;
      int vm=(v+LLs-1)%LLs;
      spProj5p(hp,psi[ss+vp]);
      spProj5m(hm,psi[ss+vm]);
      if ( vp<=v ) rotate(hp,hp,1);
      if ( vm>=v ) rotate(hm,hm,nsimd-1);
      hp=hp*0.5;
      hm=hm*0.5;
      spRecon5p(fp,hp);
      spRecon5m(fm,hm);
      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
      chi[ss+v] = chi[ss+v]     +l[v]*fm;
    }
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
  int Ls=this->Ls;
  int LLs = psi._grid->_rdimensions[0];
  int vol = psi._grid->oSites()/LLs;
  chi.checkerboard=psi.checkerboard;
  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
    Pminus(s,s)= bee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pminus(s,s+1) = -cee[s];
  }
  for(int s=0;s<Ls-1;s++){
    Pplus(s+1,s) = -cee[s+1];
  }
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  Eigen::MatrixXd PplusMat ;
  Eigen::MatrixXd PminusMat;
  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  } else { 
    PplusMat =Pplus;
    PminusMat=Pminus;
  }
  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
  const int Nsimd=Simd::Nsimd();
  Vector<iSinglet<Simd> > Matp(Ls*LLs);
  Vector<iSinglet<Simd> > Matm(Ls*LLs);
  for(int s2=0;s2<Ls;s2++){
  for(int s1=0;s1<LLs;s1++){
    int istride = LLs;
    int ostride = 1;
      Simd Vp;
      Simd Vm;
      scalar_type *sp = (scalar_type *)&Vp;
      scalar_type *sm = (scalar_type *)&Vm;
      for(int l=0;l<Nsimd;l++){
 	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
 	sm[l] = PminusMat(l*istride+s1*ostride,s2);
      }
      Matp[LLs*s2+s1] = Vp;
      Matm[LLs*s2+s1] = Vm;
    }
  }
  // Dynamic allocate on stack to get per thread without serialised heap acces
 PARALLEL_FOR_LOOP
  for(auto site=0;site<vol;site++){
    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
    Vector<SiteHalfSpinor> SitePplus(LLs);
    Vector<SiteHalfSpinor> SitePminus(LLs);
    Vector<SiteHalfSpinor> SiteChiP(LLs);
    Vector<SiteHalfSpinor> SiteChiM(LLs);
    Vector<SiteSpinor>     SiteChi(LLs);
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
      spProj5m(SitePminus[s],psi[lex]);
      SiteChiP[s]=zero;
      SiteChiM[s]=zero;
    }
    int s=0;
    for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
 	vbroadcast(BcastP,SitePplus [s2],l);
 	vbroadcast(BcastM,SitePminus[s2],l);
 	for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
 	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
 	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
 	}
      s++;
    }}
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spRecon5p(SiteChi[s],SiteChiP[s]);
      accumRecon5m(SiteChi[s],SiteChiM[s]);
      chi[lex] = SiteChi[s]*0.5;
    }
  }
 }
 INSTANTIATE_DPERP(DomainWallVec5dImplD);
 INSTANTIATE_DPERP(DomainWallVec5dImplF);
 template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 }}
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
 #define  GRID_QCD_DOMAIN_WALL_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@ -113,6 +113,8 @@ namespace Grid {
    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
      const bool LsVectorised=false;
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
@ -191,9 +193,11 @@ PARALLEL_FOR_LOOP
    // Single flavour four spinors with colour index, 5d redblack
    ///////
    template<class S,int Nrepresentation=Nc>
-    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
      const bool LsVectorised=true;
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
@ -221,7 +225,7 @@ PARALLEL_FOR_LOOP
      ImplParams Params;
-      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+      DomainWallVec5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
      bool overlapCommsCompute(void) { return false; };
@ -287,6 +291,8 @@ PARALLEL_FOR_LOOP
    class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
    public:
      const bool LsVectorised=false;
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
@ -449,7 +455,7 @@ PARALLEL_FOR_LOOP
 	auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde,A));  
 PARALLEL_FOR_LOOP
        for(auto ss=tmp.begin();ss<tmp.end();ss++){
-	  link[ss]() = tmp[ss](0,0) - conjugate(tmp[ss](1,1)) ;
+	  link[ss]() = tmp[ss](0,0) - conjugate(tmp[ss](1,1)) ; // IS THIS SIGN RIGHT?
 	}
 	PokeIndex<LorentzIndex>(mat,link,mu);
 	return;
@ -477,9 +483,9 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
-    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
-    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
+    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
-    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
+    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
--- a/lib/qcd/action/fermion/MobiusFermion.h
+++ b/lib/qcd/action/fermion/MobiusFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_FERMION_H
 #define  GRID_QCD_MOBIUS_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/lib/qcd/action/fermion/MobiusZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
 #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
 #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SCALED_SHAMIR_FERMION_H
 #define  GRID_QCD_SCALED_SHAMIR_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/lib/qcd/action/fermion/ShamirZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@ -48,9 +48,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 				       GridRedBlackCartesian &FourDimRedBlackGrid,
 				       RealD _M5,const ImplParams &p) :
  Kernels(p),
-  _FiveDimGrid(&FiveDimGrid),
+  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid(&FourDimGrid),
+  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
@ -62,60 +62,83 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid)
 {
-  // some assertions
+  if (Impl::LsVectorised) { 
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._checker_dim==1);
-  // Dimension zero of the five-d is the Ls direction
+    int nsimd = Simd::Nsimd();
  Ls=FiveDimGrid._fdimensions[0];
  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
  assert(FiveDimRedBlackGrid._processors[0] ==1);
  assert(FiveDimRedBlackGrid._simd_layout[0]==1);
  assert(FiveDimGrid._processors[0]         ==1);
  assert(FiveDimGrid._simd_layout[0]        ==1);
-  // Other dimensions must match the decomposition of the four-D fields 
+    // some assertions
-  for(int d=0;d<4;d++){
+    assert(FiveDimGrid._ndimension==5);
-    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimRedBlackGrid._ndimension==5);
-    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
    assert(FourDimGrid._ndimension==4);
-    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+    // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+    Ls=FiveDimGrid._fdimensions[0];
    assert(FiveDimGrid._processors[0]         ==1);
    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
-    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
+    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+    assert(FiveDimRedBlackGrid._processors[0] ==1);
    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
-    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    // Other dimensions must match the decomposition of the four-D fields 
-    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    for(int d=0;d<4;d++){
-    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
      assert(FourDimGrid._simd_layout[d]=1);
      assert(FourDimRedBlackGrid._simd_layout[d]=1);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
    }
  } else {
    // some assertions
    assert(FiveDimGrid._ndimension==5);
    assert(FourDimGrid._ndimension==4);
    assert(FiveDimRedBlackGrid._ndimension==5);
    assert(FourDimRedBlackGrid._ndimension==4);
    assert(FiveDimRedBlackGrid._checker_dim==1);
    // Dimension zero of the five-d is the Ls direction
    Ls=FiveDimGrid._fdimensions[0];
    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
    assert(FiveDimRedBlackGrid._processors[0] ==1);
    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
    assert(FiveDimGrid._processors[0]         ==1);
    assert(FiveDimGrid._simd_layout[0]        ==1);
    // Other dimensions must match the decomposition of the four-D fields 
    for(int d=0;d<4;d++){
      assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
      assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
      assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
    }
  }
  // Allocate the required comms buffer
  ImportGauge(_Umu);
 }
-
+  /*
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
 				       GridCartesian         &FiveDimGrid,
 				       GridRedBlackCartesian &FiveDimRedBlackGrid,
 				       GridCartesian         &FourDimGrid,
 				       RealD _M5,const ImplParams &p) :
  Kernels(p),
  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimGrid),
  UmuOdd (_FourDimGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimGrid)
 {
  int nsimd = Simd::Nsimd();
@ -148,14 +171,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
  }
  {
    GaugeField HUmu(_Umu._grid);
    HUmu = _Umu*(-0.5);
    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
    UmuEven=Umu;// Really want a reference.
    UmuOdd =Umu;
  }
 }  
-
+  */
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -376,8 +394,6 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
 template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
 template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
 }}
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@ -125,12 +125,14 @@ namespace Grid {
 		      double _M5,const ImplParams &p= ImplParams());
      // Constructors
      /*
      WilsonFermion5D(int simd, 
 		      GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      double _M5,const ImplParams &p= ImplParams());
      */
      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@ -572,7 +572,4 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
  FermOpTemplateInstantiate(WilsonKernels);
 template class WilsonKernels<DomainWallRedBlack5dImplF>;		
 template class WilsonKernels<DomainWallRedBlack5dImplD>;
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@ -90,7 +90,7 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 template<>
-void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -110,10 +110,10 @@ template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl
 template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@ -867,16 +867,16 @@ template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(Stencil
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
--- a/lib/qcd/action/fermion/WilsonTMFermion.h
+++ b/lib/qcd/action/fermion/WilsonTMFermion.h
@ -28,7 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_WILSON_TM_FERMION_H
 #define  GRID_QCD_WILSON_TM_FERMION_H
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/lib/qcd/utils/SpaceTimeGrid.cc
+++ b/lib/qcd/utils/SpaceTimeGrid.cc
@ -84,7 +84,7 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
 GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
 {
-  int N4=FourDimGrid->_ndimension;
+  int N4    = FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();
  std::vector<int> latt5(1,Ls);
@ -103,11 +103,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const Gr
 {
  int N4=FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();
-  int cbd=0;
+  int cbd=1;
  std::vector<int> latt5(1,Ls);
  std::vector<int> simd5(1,nsimd);
  std::vector<int>  mpi5(1,1);
-  std::vector<int>   cb5(1,1);
+  std::vector<int>   cb5(1,0);
  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
--- a/lib/serialisation/Serialisation.h
+++ b/lib/serialisation/Serialisation.h
@ -29,18 +29,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_SERIALISATION_READER_H
 #define GRID_SERIALISATION_READER_H
 #include <serialisation/MacroMagic.h>
 #include <serialisation/BaseIO.h>
 #include <stdint.h>
 #include "MacroMagic.h"
 #include "BaseIO.h"
 #include "BinaryIO.h"
 #include "TextIO.h"
 #include "XmlIO.h"
 //////////////////////////////////////////
 // Todo:
 //////////////////////////////////////////
-#include <serialisation/BinaryIO.h>
+//#include "JsonIO.h"
-#include <serialisation/TextIO.h>
+//#include "YamlIO.h"
 //#include <serialisation/JsonIO.h>
 //#include <serialisation/YamlIO.h>
 #include <serialisation/XmlIO.h>
 //////////////////////////////////////////
 // Select the default serialiser use ifdef's
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@ -38,7 +38,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <vector>
 #include <cassert>
-#include "pugixml/pugixml.h"
+#include <Grid/pugixml/pugixml.h>
 namespace Grid
 {
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@ -303,13 +303,14 @@ namespace Grid {
 	int dist = perm&0xF;
        y=rotate(b,dist);
 	return;
-      }
+      } else { 
-      switch(perm){
+	switch(perm){
-      case 3: permute3(y,b); break;
+	case 3: permute3(y,b); break;
-      case 2: permute2(y,b); break;
+	case 2: permute2(y,b); break;
-      case 1: permute1(y,b); break;
+	case 1: permute1(y,b); break;
-      case 0: permute0(y,b); break;
+	case 0: permute0(y,b); break;
-      default: assert(0);
+	default: assert(0);
 	}
      }
    }
@ -336,6 +337,20 @@ namespace Grid {
    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
    return ret;
  }
  template <class S, class V, IfNotComplex<S> =0> 
  inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
  {
    nrot = nrot % Grid_simd<S,V>::Nsimd();
    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
    ret.v = Optimization::Rotate::rotate(b.v,nrot);
  }
  template <class S, class V, IfComplex<S> =0> 
    inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
  {
    nrot = nrot % Grid_simd<S,V>::Nsimd();
    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
  }
  ///////////////////////
  // Splat
@ -347,6 +362,12 @@ namespace Grid {
    ret.v = binary<V>(a, b, VsplatSIMD());
  }    
  template <class S, class V> 
  inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
    S* typepun =(S*) &src;
    vsplat(ret,typepun[lane]);
  }    
  // overload if complex
  template <class S,class V> inline void vsplat(Grid_simd<S,V> &ret, EnableIf<is_complex < S >, S> c) {
    vsplat(ret,real(c),imag(c));
@ -354,7 +375,7 @@ namespace Grid {
  //if real fill with a, if complex fill with a in the real part (first function above)
  template <class S,class V>
-    inline void vsplat(Grid_simd<S,V> &ret,NotEnableIf<is_complex< S>,S> a){
+  inline void vsplat(Grid_simd<S,V> &ret,NotEnableIf<is_complex< S>,S> a){
    ret.v = unary<V>(a, VsplatSIMD());
  }    
  //////////////////////////
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@ -90,8 +90,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12
-#include <simd/Intel512common.h>
+#include "Intel512common.h"
-#include <simd/Intel512avx.h>
+#include "Intel512avx.h"
 //////////////////////////////////////////////////////////////////
 // Macros used to build wilson kernel -- can rationalise and simplify
--- a/lib/tensors/Tensor_arith.h
+++ b/lib/tensors/Tensor_arith.h
@ -28,11 +28,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_MATH_ARITH_H
 #define GRID_MATH_ARITH_H
-#include <tensors/Tensor_arith_add.h>
+#include "Tensor_arith_add.h"
-#include <tensors/Tensor_arith_sub.h>
+#include "Tensor_arith_sub.h"
-#include <tensors/Tensor_arith_mac.h>
+#include "Tensor_arith_mac.h"
-#include <tensors/Tensor_arith_mul.h>
+#include "Tensor_arith_mul.h"
-#include <tensors/Tensor_arith_scalar.h>
+#include "Tensor_arith_scalar.h"
 #endif
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@ -84,6 +84,9 @@ public:
  friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
    vstream(out._internal,in._internal);
  }
  friend strong_inline void vbroadcast(iScalar<vtype> &out,const iScalar<vtype> &in,int lane){
    vbroadcast(out._internal,in._internal,lane);
  }
  friend strong_inline void zeroit(iScalar<vtype> &that){
    zeroit(that._internal);
  }
@ -93,6 +96,9 @@ public:
  friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
    permute(out._internal,in._internal,permutetype);
  }
  friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
    rotate(out._internal,in._internal,rot);
  }
  // Unary negation
  friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
@ -200,11 +206,21 @@ public:
      vstream(out._internal[i],in._internal[i]);
    }
  }
  friend strong_inline void vbroadcast(iVector<vtype,N> &out,const iVector<vtype,N> &in,int lane){
    for(int i=0;i<N;i++){
      vbroadcast(out._internal[i],in._internal[i],lane);
    }
  }
  friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
      permute(out._internal[i],in._internal[i],permutetype);
    }
  }
  friend strong_inline void rotate(iVector<vtype,N> &out,const iVector<vtype,N> &in,int rot){
    for(int i=0;i<N;i++){
      rotate(out._internal[i],in._internal[i],rot);
    }
  }
  // Unary negation
  friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
@ -318,7 +334,13 @@ public:
      for(int j=0;j<N;j++){
 	vstream(out._internal[i][j],in._internal[i][j]);
      }}
-    }
+  }
  friend strong_inline void vbroadcast(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int lane){
      for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	vbroadcast(out._internal[i][j],in._internal[i][j],lane);
      }}
  }
  friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
@ -326,6 +348,12 @@ public:
 	permute(out._internal[i][j],in._internal[i][j],permutetype);
    }}
  }
  friend strong_inline void rotate(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int rot){
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
 	rotate(out._internal[i][j],in._internal[i][j],rot);
    }}
  }
  // Unary negation
--- a/prerequisites/Makefile.am
+++ b/prerequisites/Makefile.am
@ -0,0 +1,38 @@
 FFTFLAGS=$(filter-out -std=c++11, $(CXXFLAGS) )
 EIGENVER=3.2.8
 EIGEN=eigen$(EIGENVER)
 EIGENTAR=$(EIGEN).tar.bz2
 EIGENURL=https://bitbucket.org/eigen/eigen/get/$(EIGENVER).tar.bz2
 FFTWVER=3.3.4
 FFTW=fftw-$(FFTWVER)
 FFTWTAR=fftw-$(FFTWVER).tar.gz
 FFTWURL=http://www.fftw.org/$(FFTWTAR)
 all: Eigen FFTW headerlist
 $(top_srcdir)/prerequisites/$(EIGENTAR): 
 	curl -v $(EIGENURL) -o $(top_srcdir)/prerequisites/$(EIGENTAR)
 $(top_srcdir)/prerequisites/$(FFTWTAR): 
 	curl -v $(FFTWURL) -o $(top_srcdir)/prerequisites/$(FFTWTAR)
 Eigen:  $(top_srcdir)/prerequisites/$(EIGENTAR)
 	tar xvf $(top_srcdir)/prerequisites/$(EIGENTAR)
 	- rm -rf  $(top_srcdir)/lib/Eigen
 	mv eigen-eigen*/Eigen .
 	echo EFILES=`find Eigen -type f -name '*.h' ` > $(top_srcdir)/lib/Eigen.inc
 	mv Eigen $(top_srcdir)/lib/
 	touch Eigen
 FFTW: $(top_srcdir)/prerequisites/$(FFTWTAR)
 	tar xvf $(top_srcdir)/prerequisites/$(FFTWTAR)
 	cd $(FFTW) &&	./configure --prefix=@abs_top_builddir@/prerequisites/fftwinstall CFLAGS="$(FFTFLAGS)" CC=$(CC) LDFLAGS="$(LDFLAGS)" && make all install
 	cp -pr fftwinstall/include/fftw3.h ../include/Grid/
 	cp -pr fftwinstall/lib/libfftw3.a  ../lib/
 	touch FFTW
 headerlist:
 	cd $(top_srcdir) && ./scripts/filelist
 	touch headerlist
--- a/prerequisites/eigen3.2.8.tar.bz2
+++ b/prerequisites/eigen3.2.8.tar.bz2
--- a/prerequisites/eigenIGENVER.tar.bz2
+++ b/prerequisites/eigenIGENVER.tar.bz2
--- a/scripts/filelist
+++ b/scripts/filelist
@ -1,7 +1,8 @@
 #!/bin/bash
-cd lib
+home=`pwd`
 cd $home/lib
 HFILES=`find . -type f -name '*.h'`
 CCFILES=`find . -type f -name '*.cc' -not  -name '*ommunicator*.cc'`
 echo> Make.inc
@ -9,18 +10,24 @@ echo HFILES=$HFILES >> Make.inc
 echo >> Make.inc
 echo CCFILES=$CCFILES >> Make.inc
-cd ..
+cd $home/tests
-cd tests
+dirs=`find . -type d `
 for subdir in $dirs
 do
 pwd
 echo subdir is $subdir of $dirs
 cd $home/tests/$subdir
 echo> Make.inc
 TESTS=`ls T*.cc`
 TESTLIST=`echo ${TESTS} | sed s/.cc//g `
 echo > Make.inc
 echo bin_PROGRAMS += ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
 echo >> Make.inc
 for f in $TESTS
 do
 BNAME=`basename $f .cc`
@ -30,31 +37,10 @@ echo ${BNAME}_LDADD=-lGrid>> Make.inc
 echo >> Make.inc
 done
 cd qdpxx
 echo> Make.inc
 TESTS=`ls T*.cc`
 TESTLIST=`echo ${TESTS} | sed s/.cc//g `
 echo > Make.inc
 echo bin_PROGRAMS = ${TESTLIST} >> Make.inc
 echo >> Make.inc
 for f in $TESTS
 do
 BNAME=`basename $f .cc`
 echo >> Make.inc
 echo ${BNAME}_SOURCES=$f  >> Make.inc
 echo ${BNAME}_LDADD=-lGrid>> Make.inc
 echo >> Make.inc
 done
-cd ..
+cd $home/benchmarks
 cd ..
 cd benchmarks
 echo> Make.inc
 TESTS=`ls B*.cc`
--- a/scripts/linecount
+++ b/scripts/linecount
@ -19,7 +19,7 @@ count=`grep total tmp.fil`
 echo $count " in lib/${sdir}" 
 done
-wc -l tests/*.cc | grep total >& tmp.fil
+wc -l tests/*.cc tests/*/*.cc | grep total >& tmp.fil
 count=`grep total tmp.fil`
 echo  $count " in tests"
--- a/tests/IO/Make.inc
+++ b/tests/IO/Make.inc
@ -0,0 +1,11 @@
 bin_PROGRAMS += Test_nersc_io Test_serialisation
 Test_nersc_io_SOURCES=Test_nersc_io.cc
 Test_nersc_io_LDADD=-lGrid
 Test_serialisation_SOURCES=Test_serialisation.cc
 Test_serialisation_LDADD=-lGrid
--- a/tests/IO/Makefile.am
+++ b/tests/IO/Makefile.am
@ -0,0 +1,19 @@
 # additional include paths necessary to compile the C++ library
 bin_PROGRAMS =
 SUBDIRS =
 AM_CXXFLAGS = -I$(top_srcdir)/include
 AM_LDFLAGS = -L$(top_builddir)/lib
 if USE_LAPACK
 AM_CXXFLAGS += -DUSE_LAPACK
 if USE_LAPACK_LIB
 #if test "X${ac_LAPACK}X" != XyesX 
 AM_CXXFLAGS += -I$(ac_LAPACK)/include
 AM_LDFLAGS += -L$(ac_LAPACK)/lib
 #fi
 endif
 endif
 include Make.inc
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@ -26,7 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 namespace Grid {
--- a/tests/Make.inc
+++ b/tests/Make.inc
@ -1,239 +1,11 @@
-bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
+bin_PROGRAMS += Test_cshift Test_simd Test_stencil
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 Test_GaugeAction_LDADD=-lGrid
 Test_RectPlaq_SOURCES=Test_RectPlaq.cc
 Test_RectPlaq_LDADD=-lGrid
 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
 Test_cayley_cg_LDADD=-lGrid
 Test_cayley_coarsen_support_SOURCES=Test_cayley_coarsen_support.cc
 Test_cayley_coarsen_support_LDADD=-lGrid
 Test_cayley_even_odd_SOURCES=Test_cayley_even_odd.cc
 Test_cayley_even_odd_LDADD=-lGrid
 Test_cayley_ldop_cr_SOURCES=Test_cayley_ldop_cr.cc
 Test_cayley_ldop_cr_LDADD=-lGrid
 Test_cf_coarsen_support_SOURCES=Test_cf_coarsen_support.cc
 Test_cf_coarsen_support_LDADD=-lGrid
 Test_cf_cr_unprec_SOURCES=Test_cf_cr_unprec.cc
 Test_cf_cr_unprec_LDADD=-lGrid
 Test_cheby_SOURCES=Test_cheby.cc
 Test_cheby_LDADD=-lGrid
 Test_contfrac_cg_SOURCES=Test_contfrac_cg.cc
 Test_contfrac_cg_LDADD=-lGrid
 Test_contfrac_even_odd_SOURCES=Test_contfrac_even_odd.cc
 Test_contfrac_even_odd_LDADD=-lGrid
 Test_contfrac_force_SOURCES=Test_contfrac_force.cc
 Test_contfrac_force_LDADD=-lGrid
 Test_cshift_SOURCES=Test_cshift.cc
 Test_cshift_LDADD=-lGrid
 Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
 Test_cshift_red_black_LDADD=-lGrid
 Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
 Test_cshift_red_black_rotate_LDADD=-lGrid
 Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
 Test_cshift_rotate_LDADD=-lGrid
 Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
 Test_dwf_cg_prec_LDADD=-lGrid
 Test_dwf_cg_schur_SOURCES=Test_dwf_cg_schur.cc
 Test_dwf_cg_schur_LDADD=-lGrid
 Test_dwf_cg_unprec_SOURCES=Test_dwf_cg_unprec.cc
 Test_dwf_cg_unprec_LDADD=-lGrid
 Test_dwf_cr_unprec_SOURCES=Test_dwf_cr_unprec.cc
 Test_dwf_cr_unprec_LDADD=-lGrid
 Test_dwf_even_odd_SOURCES=Test_dwf_even_odd.cc
 Test_dwf_even_odd_LDADD=-lGrid
 Test_dwf_force_SOURCES=Test_dwf_force.cc
 Test_dwf_force_LDADD=-lGrid
 Test_dwf_fpgcr_SOURCES=Test_dwf_fpgcr.cc
 Test_dwf_fpgcr_LDADD=-lGrid
 Test_dwf_gpforce_SOURCES=Test_dwf_gpforce.cc
 Test_dwf_gpforce_LDADD=-lGrid
 Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
 Test_dwf_hdcr_LDADD=-lGrid
 Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 Test_dwf_lanczos_LDADD=-lGrid
 Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
 Test_dwf_rb5d_LDADD=-lGrid
 Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid
 Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
 Test_gp_rect_force_LDADD=-lGrid
 Test_gparity_SOURCES=Test_gparity.cc
 Test_gparity_LDADD=-lGrid
 Test_gpdwf_force_SOURCES=Test_gpdwf_force.cc
 Test_gpdwf_force_LDADD=-lGrid
 Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 Test_gpwilson_even_odd_LDADD=-lGrid
 Test_hmc_EODWFRatio_SOURCES=Test_hmc_EODWFRatio.cc
 Test_hmc_EODWFRatio_LDADD=-lGrid
 Test_hmc_EODWFRatio_Gparity_SOURCES=Test_hmc_EODWFRatio_Gparity.cc
 Test_hmc_EODWFRatio_Gparity_LDADD=-lGrid
 Test_hmc_EOWilsonFermionGauge_SOURCES=Test_hmc_EOWilsonFermionGauge.cc
 Test_hmc_EOWilsonFermionGauge_LDADD=-lGrid
 Test_hmc_EOWilsonRatio_SOURCES=Test_hmc_EOWilsonRatio.cc
 Test_hmc_EOWilsonRatio_LDADD=-lGrid
 Test_hmc_GparityIwasakiGauge_SOURCES=Test_hmc_GparityIwasakiGauge.cc
 Test_hmc_GparityIwasakiGauge_LDADD=-lGrid
 Test_hmc_GparityWilsonGauge_SOURCES=Test_hmc_GparityWilsonGauge.cc
 Test_hmc_GparityWilsonGauge_LDADD=-lGrid
 Test_hmc_IwasakiGauge_SOURCES=Test_hmc_IwasakiGauge.cc
 Test_hmc_IwasakiGauge_LDADD=-lGrid
 Test_hmc_RectGauge_SOURCES=Test_hmc_RectGauge.cc
 Test_hmc_RectGauge_LDADD=-lGrid
 Test_hmc_WilsonFermionGauge_SOURCES=Test_hmc_WilsonFermionGauge.cc
 Test_hmc_WilsonFermionGauge_LDADD=-lGrid
 Test_hmc_WilsonGauge_SOURCES=Test_hmc_WilsonGauge.cc
 Test_hmc_WilsonGauge_LDADD=-lGrid
 Test_hmc_WilsonRatio_SOURCES=Test_hmc_WilsonRatio.cc
 Test_hmc_WilsonRatio_LDADD=-lGrid
 Test_lie_generators_SOURCES=Test_lie_generators.cc
 Test_lie_generators_LDADD=-lGrid
 Test_main_SOURCES=Test_main.cc
 Test_main_LDADD=-lGrid
 Test_multishift_sqrt_SOURCES=Test_multishift_sqrt.cc
 Test_multishift_sqrt_LDADD=-lGrid
 Test_nersc_io_SOURCES=Test_nersc_io.cc
 Test_nersc_io_LDADD=-lGrid
 Test_partfrac_force_SOURCES=Test_partfrac_force.cc
 Test_partfrac_force_LDADD=-lGrid
 Test_quenched_update_SOURCES=Test_quenched_update.cc
 Test_quenched_update_LDADD=-lGrid
 Test_rect_force_SOURCES=Test_rect_force.cc
 Test_rect_force_LDADD=-lGrid
 Test_remez_SOURCES=Test_remez.cc
 Test_remez_LDADD=-lGrid
 Test_rhmc_EOWilson1p1_SOURCES=Test_rhmc_EOWilson1p1.cc
 Test_rhmc_EOWilson1p1_LDADD=-lGrid
 Test_rhmc_EOWilsonRatio_SOURCES=Test_rhmc_EOWilsonRatio.cc
 Test_rhmc_EOWilsonRatio_LDADD=-lGrid
 Test_rhmc_Wilson1p1_SOURCES=Test_rhmc_Wilson1p1.cc
 Test_rhmc_Wilson1p1_LDADD=-lGrid
 Test_rhmc_WilsonRatio_SOURCES=Test_rhmc_WilsonRatio.cc
 Test_rhmc_WilsonRatio_LDADD=-lGrid
 Test_rng_SOURCES=Test_rng.cc
 Test_rng_LDADD=-lGrid
 Test_rng_fixed_SOURCES=Test_rng_fixed.cc
 Test_rng_fixed_LDADD=-lGrid
 Test_serialisation_SOURCES=Test_serialisation.cc
 Test_serialisation_LDADD=-lGrid
 Test_simd_SOURCES=Test_simd.cc
 Test_simd_LDADD=-lGrid
@ -241,47 +13,3 @@ Test_simd_LDADD=-lGrid
 Test_stencil_SOURCES=Test_stencil.cc
 Test_stencil_LDADD=-lGrid
 Test_synthetic_lanczos_SOURCES=Test_synthetic_lanczos.cc
 Test_synthetic_lanczos_LDADD=-lGrid
 Test_wilson_cg_prec_SOURCES=Test_wilson_cg_prec.cc
 Test_wilson_cg_prec_LDADD=-lGrid
 Test_wilson_cg_schur_SOURCES=Test_wilson_cg_schur.cc
 Test_wilson_cg_schur_LDADD=-lGrid
 Test_wilson_cg_unprec_SOURCES=Test_wilson_cg_unprec.cc
 Test_wilson_cg_unprec_LDADD=-lGrid
 Test_wilson_cr_unprec_SOURCES=Test_wilson_cr_unprec.cc
 Test_wilson_cr_unprec_LDADD=-lGrid
 Test_wilson_even_odd_SOURCES=Test_wilson_even_odd.cc
 Test_wilson_even_odd_LDADD=-lGrid
 Test_wilson_force_SOURCES=Test_wilson_force.cc
 Test_wilson_force_LDADD=-lGrid
 Test_wilson_force_phiMdagMphi_SOURCES=Test_wilson_force_phiMdagMphi.cc
 Test_wilson_force_phiMdagMphi_LDADD=-lGrid
 Test_wilson_force_phiMphi_SOURCES=Test_wilson_force_phiMphi.cc
 Test_wilson_force_phiMphi_LDADD=-lGrid
 Test_wilson_tm_even_odd_SOURCES=Test_wilson_tm_even_odd.cc
 Test_wilson_tm_even_odd_LDADD=-lGrid
 Test_zmm_SOURCES=Test_zmm.cc
 Test_zmm_LDADD=-lGrid
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@ -1,11 +1,16 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = 
+#SUBDIRS = core
 # Uncomment to enable complete test suite build
 SUBDIRS = core forces hmc solver debug	
 if BUILD_CHROMA_REGRESSION
  SUBDIRS+= qdpxx
 endif
 bin_PROGRAMS =
-AM_CXXFLAGS = -I$(top_srcdir)/lib
+AM_CXXFLAGS = -I$(top_srcdir)/include
 AM_LDFLAGS = -L$(top_builddir)/lib
 if USE_LAPACK
@ -18,10 +23,4 @@ AM_LDFLAGS += -L$(ac_LAPACK)/lib
 endif
 endif
 if BUILD_ZMM
  bin_PROGRAMS=Test_zmm
 else
  bin_PROGRAMS=
 endif
 include Make.inc
--- a/tests/Test_cshift.cc
+++ b/tests/Test_cshift.cc
@ -26,7 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@ -26,7 +26,7 @@ Author: neo <cossu@post.kek.jp>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include "Grid.h"
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/tests/core/Make.inc
+++ b/tests/core/Make.inc
@ -0,0 +1,83 @@
 bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cf_coarsen_support Test_checker Test_contfrac_even_odd Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_even_odd Test_dwf_rb5d Test_gamma Test_gparity Test_gpwilson_even_odd Test_lie_generators Test_main Test_quenched_update Test_rng Test_rng_fixed Test_wilson_even_odd Test_wilson_tm_even_odd
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 Test_GaugeAction_LDADD=-lGrid
 Test_RectPlaq_SOURCES=Test_RectPlaq.cc
 Test_RectPlaq_LDADD=-lGrid
 Test_cf_coarsen_support_SOURCES=Test_cf_coarsen_support.cc
 Test_cf_coarsen_support_LDADD=-lGrid
 Test_checker_SOURCES=Test_checker.cc
 Test_checker_LDADD=-lGrid
 Test_contfrac_even_odd_SOURCES=Test_contfrac_even_odd.cc
 Test_contfrac_even_odd_LDADD=-lGrid
 Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
 Test_cshift_red_black_LDADD=-lGrid
 Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
 Test_cshift_red_black_rotate_LDADD=-lGrid
 Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
 Test_cshift_rotate_LDADD=-lGrid
 Test_dwf_even_odd_SOURCES=Test_dwf_even_odd.cc
 Test_dwf_even_odd_LDADD=-lGrid
 Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
 Test_dwf_rb5d_LDADD=-lGrid
 Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid
 Test_gparity_SOURCES=Test_gparity.cc
 Test_gparity_LDADD=-lGrid
 Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 Test_gpwilson_even_odd_LDADD=-lGrid
 Test_lie_generators_SOURCES=Test_lie_generators.cc
 Test_lie_generators_LDADD=-lGrid
 Test_main_SOURCES=Test_main.cc
 Test_main_LDADD=-lGrid
 Test_quenched_update_SOURCES=Test_quenched_update.cc
 Test_quenched_update_LDADD=-lGrid
 Test_rng_SOURCES=Test_rng.cc
 Test_rng_LDADD=-lGrid
 Test_rng_fixed_SOURCES=Test_rng_fixed.cc
 Test_rng_fixed_LDADD=-lGrid
 Test_wilson_even_odd_SOURCES=Test_wilson_even_odd.cc
 Test_wilson_even_odd_LDADD=-lGrid
 Test_wilson_tm_even_odd_SOURCES=Test_wilson_tm_even_odd.cc
 Test_wilson_tm_even_odd_LDADD=-lGrid
--- a/tests/core/Makefile.am
+++ b/tests/core/Makefile.am
@ -0,0 +1,19 @@
 # additional include paths necessary to compile the C++ library
 bin_PROGRAMS =
 SUBDIRS =
 AM_CXXFLAGS = -I$(top_srcdir)/include
 AM_LDFLAGS = -L$(top_builddir)/lib
 if USE_LAPACK
 AM_CXXFLAGS += -DUSE_LAPACK
 if USE_LAPACK_LIB
 #if test "X${ac_LAPACK}X" != XyesX 
 AM_CXXFLAGS += -I$(ac_LAPACK)/include
 AM_LDFLAGS += -L$(ac_LAPACK)/lib
 #fi
 endif
 endif
 include Make.inc
--- a/tests/core/Test_GaugeAction.cc
+++ b/tests/core/Test_GaugeAction.cc
@ -27,10 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <qcd/utils/CovariantCshift.h>
 #include <qcd/utils/WilsonLoops.h>
 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_RectPlaq.cc
+++ b/tests/core/Test_RectPlaq.cc
@ -26,10 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #include <qcd/utils/CovariantCshift.h>
 #include <qcd/utils/WilsonLoops.h>
 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_cf_coarsen_support.cc
+++ b/tests/core/Test_cf_coarsen_support.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_checker.cc
+++ b/tests/core/Test_checker.cc
@ -0,0 +1,155 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_dwf_cg_prec.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 template<class d>
 struct scal {
  d internal;
 };
  Gamma::GammaMatrix Gmu [] = {
    Gamma::GammaX,
    Gamma::GammaY,
    Gamma::GammaZ,
    Gamma::GammaT
  };
 int toint(const char* str){
  std::stringstream os; os << str;
  int out; os >> out;
  return out;
 }
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  assert(argc >= 5);
  std::vector<int> latt(4,0);
  latt[0] = toint(argv[1]);
  latt[1] = toint(argv[2]);
  latt[2] = toint(argv[3]);
  latt[3] = toint(argv[4]);
  const int Ls= toint(argv[5]);
  std::cout << "Lattice size (" << latt[0] << "," << latt[1] << "," << latt[2] << "," << latt[3] << ") Ls=" << Ls << std::endl;
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
  std::cout << "SIMD layout (" << simd_layout[0] << "," << simd_layout[1] << "," << simd_layout[2] << "," << simd_layout[3] << ")" << std::endl;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt, simd_layout,GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  //typedef Lattice<iGparitySpinColourVector<vComplexD> > LatticeType;
  typedef LatticeFermionD LatticeType;
  LatticeType    src(FGrid); random(RNG5,src);
  LatticeType src_o(FrbGrid);
  pickCheckerboard(Odd,src_o,src);
  std::vector<int> site(5);
  std::vector<int> cbsite(5);
  typedef typename GridTypeMapper<LatticeType::vector_object>::scalar_object sobj;
  // std::cout << "sizeof(vobj) " << sizeof(LatticeType::vector_object) << std::endl;
  // std::cout << "sizeof(sobj) " << sizeof(sobj) << std::endl;
  std::cout << "v1 from uncheckerboarded field, v2 from odd-parity red-black field\n";
  for(site[0]=0;site[0]<Ls;site[0]++){
    for(site[4]=0;site[4]<latt[3];site[4]++){
      for(site[3]=0;site[3]<latt[2];site[3]++){
 	for(site[2]=0;site[2]<latt[1];site[2]++){
 	  for(site[1]=0;site[1]<latt[0];site[1]++){
 	    if(src_o._grid->CheckerBoard(site) != src_o.checkerboard)
 	      continue;
 	    std::cout << "Site (" << site[0] << "," << site[1] << "," << site[2] << "," << site[3] << "," << site[4] << ")" << std::endl;
 	    sobj v1, v2;
 	    peekLocalSite(v1,src,site);
 	    peekLocalSite(v2,src_o,site);
 	    RealD v1_norm = norm2(v1);
 	    RealD v2_norm = norm2(v2);
 	    RealD diff = v2_norm - v1_norm;
 	    std::cout << v1_norm << " " << v2_norm << " " << diff << '\n';
 	    if(fabs(diff) > 1e-12){
 	      std::cout << "ERROR!\n";
 	      exit(-1);
 	    }
 	  }
 	}
      }
    }
  }
  // LatticeFermion result(FGrid); result=zero;
  // LatticeGaugeField Umu(UGrid); 
  // SU3::HotConfiguration(RNG4,Umu);
  // std::vector<LatticeColourMatrix> U(4,UGrid);
  // for(int mu=0;mu<Nd;mu++){
  //   U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  // }
  // RealD mass=0.1;
  // RealD M5=1.8;
  // DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  // LatticeFermion    src_o(FrbGrid);
  // LatticeFermion result_o(FrbGrid);
  // pickCheckerboard(Odd,src_o,src);
  // result_o=zero;
  // SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
  // ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
  // CG(HermOpEO,src_o,result_o);
  Grid_finalize();
 }
--- a/tests/core/Test_contfrac_even_odd.cc
+++ b/tests/core/Test_contfrac_even_odd.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_cshift_red_black.cc
+++ b/tests/core/Test_cshift_red_black.cc
@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
--- a/tests/core/Test_cshift_red_black_rotate.cc
+++ b/tests/core/Test_cshift_red_black_rotate.cc
@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
--- a/tests/core/Test_cshift_rotate.cc
+++ b/tests/core/Test_cshift_rotate.cc
@ -26,7 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
--- a/tests/core/Test_dwf_even_odd.cc
+++ b/tests/core/Test_dwf_even_odd.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_dwf_rb5d.cc
+++ b/tests/core/Test_dwf_rb5d.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
@ -44,9 +44,9 @@ struct scal {
    Gamma::GammaT
  };
-typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
 typedef WilsonFermion5D<WilsonImplR> WilsonFermion5D_OKR;
@ -97,14 +97,12 @@ int main (int argc, char ** argv)
  RealD M5  =1.8;
  typename WilsonFermion5DR::ImplParams params;
-  WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,M5,params);
+  WilsonFermion5DR Dw(Umu,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,M5,params);
  Dw.Dhop(src,result,0);
  std::cout << "Norm src = "<<norm2(src)<<" Norm res = "<<norm2(result) << std::endl;
  GridCartesian         * FokGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FokrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  WilsonFermion5D_OKR Dok(Umu,*FokGrid,*FokrbGrid,*UGrid,*UrbGrid,M5,params);
@ -131,7 +129,7 @@ int main (int argc, char ** argv)
  std::cout << "Reference = "<<norm2(src_ok)<<" res = "<<norm2(ref_ok) << std::endl;
  ref_ok = ref_ok - result_ok;
-  std::cout << "Reference diff = "<<norm2(result_ok)<< std::endl;
+  std::cout << "Result = "<<norm2(result_ok)<< std::endl;
  std::cout << "Reference diff = "<<norm2(ref_ok)<< std::endl;
  Grid_finalize();
--- a/Show More
+++ b/Show More