Merge branch 'feature/Ls-vectorised-actions' into develop

2025-09-19 09:41:05 +01:00 · 2016-07-15 19:09:47 +01:00
parent fc4a043663 980ff18956
commit da34d75841
170 changed files with 3014 additions and 1310 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -82,13 +82,14 @@ install:
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
    
 script:
-    - ./scripts/reconfigure_script
+    - ./autogen.sh
    - mkdir build
    - cd build
    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
-    - make -j4
+    - make -j1 -C prerequisites
+    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1
-    - make clean
+    - echo make clean
    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,5 +1,6 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/
-SUBDIRS = lib tests benchmarks
+AM_CXXFLAGS = -I$(top_srcdir)/include/
+
+SUBDIRS = prerequisites lib benchmarks tests

 filelist: $(SUBDIRS)
--- a/scripts/reconfigure_script
+++ b/scripts/reconfigure_script
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
@@ -196,5 +196,126 @@ int main (int argc, char ** argv)



+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+
+
+  for(int lat=4;lat<=32;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
+
+      std::vector<int> latt_size  ({lat,lat,lat,lat});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+
+      std::vector<CartesianCommunicator::CommsRequest_t> empty;
+      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
+      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
+
+      for(int mu=0;mu<4;mu++){
+	ncomm=0;
+	if (mpi_layout[mu]>1 ) {
+	  ncomm++;
+
+	  int comm_proc;
+	  int xmit_to_rank;
+	  int recv_from_rank;
+
+	  comm_proc=1;
+	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	  Grid.SendToRecvFromInit(requests_fwd[mu],
+				  (void *)&xbuf[mu][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu][0],
+				  recv_from_rank,
+				  bytes);
+
+	  comm_proc = mpi_layout[mu]-1;
+	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	  Grid.SendToRecvFromInit(requests_bwd[mu],
+				  (void *)&xbuf[mu+4][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu+4][0],
+				  recv_from_rank,
+				  bytes);
+
+	}
+      }
+
+      {
+	double start=usecond();
+	for(int i=0;i<Nloop;i++){
+	  
+	  for(int mu=0;mu<4;mu++){
+	    
+	    if (mpi_layout[mu]>1 ) {
+	      
+	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
+	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
+	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
+	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
+	    }
+	  }
+	  Grid.Barrier();
+	}
+	
+	double stop=usecond();
+	
+	double dbytes    = bytes;
+	double xbytes    = Nloop*dbytes*2.0*ncomm;
+	double rbytes    = xbytes;
+	double bidibytes = xbytes+rbytes;
+	
+	double time = stop-start;
+	
+	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+
+      }
+
+
+      {
+	double start=usecond();
+	for(int i=0;i<Nloop;i++){
+	  
+	  for(int mu=0;mu<4;mu++){
+	    
+	    if (mpi_layout[mu]>1 ) {
+	      
+	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
+	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
+	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
+	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
+	    }
+	  }
+	  Grid.Barrier();
+	}
+	
+	double stop=usecond();
+	
+	double dbytes    = bytes;
+	double xbytes    = Nloop*dbytes*2.0*ncomm;
+	double rbytes    = xbytes;
+	double bidibytes = xbytes+rbytes;
+	
+	double time = stop-start;
+	
+	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+
+      }
+
+    }
+  }
+
+
+
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-#include <PerfCount.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
@@ -46,9 +45,9 @@ struct scal {
  };

 bool overlapComms = false;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;


 int main (int argc, char ** argv)
@@ -71,8 +70,8 @@ int main (int argc, char ** argv)

  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
@@ -87,6 +86,16 @@ int main (int argc, char ** argv)
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);

+  /*  src=zero;
+  std::vector<int> origin(5,0);
+  SpinColourVector f=zero;
+  for(int sp=0;sp<4;sp++){
+  for(int co=0;co<3;co++){
+    f()(sp)(co)=Complex(1.0,0.0); 
+  }}
+  pokeSite(f,src,origin);
+  */
+
  ColourMatrix cm = Complex(1.0,0.0);

  LatticeGaugeField Umu(UGrid); 
@@ -127,19 +136,16 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5  =1.8;

-  typename DomainWallFermionR::ImplParams params; 
-  params.overlapCommsCompute = overlapComms;
-  
  RealD NP = UGrid->_Nprocessors;

  for(int doasm=1;doasm<2;doasm++){

    QCD::WilsonKernelsStatic::AsmOpt=doasm;

-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall =10;
+  int ncall =100;
  if (1) {

    double t0=usecond();
@@ -165,11 +171,12 @@ int main (int argc, char ** argv)

  if (1)
  {
-    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+    typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
    LatticeFermion ssrc(sFGrid);
    LatticeFermion sref(sFGrid);
    LatticeFermion sresult(sFGrid);
-    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
+
+    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
  
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
@@ -181,7 +188,7 @@ int main (int argc, char ** argv)
      peekSite(tmp,src,site);
      pokeSite(tmp,ssrc,site);
    }}}}}
-
+    std::cout<<"src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
@@ -208,6 +215,7 @@ int main (int argc, char ** argv)
      }
    }

+    std::cout<<"res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;


    RealF sum=0;
@@ -221,9 +229,11 @@ int main (int argc, char ** argv)
      peekSite(normal,result,site);
      peekSite(simd,sresult,site);
      sum=sum+norm2(normal-simd);
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
+      if (norm2(normal-simd) > 1.0e-6 ) {
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd   "<<simd<<std::endl;
+      }
    }}}}}
    std::cout<<" difference between normal and simd is "<<sum<<std::endl;

@@ -268,9 +278,9 @@ int main (int argc, char ** argv)
      pickCheckerboard(Even,ssrc_e,sresult);
      pickCheckerboard(Odd ,ssrc_o,sresult);
      ssrc_e = ssrc_e - sr_e;
-      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<<std::endl;
+      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
      ssrc_o = ssrc_o - sr_o;
-      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<<std::endl;
+      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
    }


--- a/benchmarks/Benchmark_dwf_ntpf.cc
+++ b/benchmarks/Benchmark_dwf_ntpf.cc
@@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-#include <PerfCount.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-#include <PerfCount.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
@@ -126,7 +125,6 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )

  ColourMatrix cm = Complex(1.0,0.0);

-
  LatticeGaugeField Umu5d(FGrid); 

  // replicate across fifth dimension
@@ -145,11 +143,10 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  }

 #ifdef CHECK
-  if (1)
-  {
+  if (1) {
+
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
-
      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

@@ -193,20 +190,19 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
    Counter.Report();
  }
  
-  if ( ! report ) 
-    {
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=1344*volume*ncall;
-      std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
-    }
+  if ( ! report ) {
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
+  }
  
 #ifdef CHECK
-    err = ref-result; 
-    RealD errd = norm2(err);
-    if ( errd> 1.0e-4 ) {
-      std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
-      exit(-1);
-    }
+  err = ref-result; 
+  RealD errd = norm2(err);
+  if ( errd> 1.0e-4 ) {
+    std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
+    exit(-1);
+  }
 #endif
    
  LatticeFermion src_e (FrbGrid);
@@ -232,10 +228,9 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
      std::cout<< flops/(t1-t0);
    }
  }
-  
 }

-#undef CHECK_SDW
+#define CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {

@@ -243,7 +238,9 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

@@ -277,93 +274,89 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
    }
  }

-
  RealD mass=0.1;
  RealD M5  =1.8;

-    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
-    LatticeFermion ssrc(sFGrid);
-    LatticeFermion sref(sFGrid);
-    LatticeFermion sresult(sFGrid);
-    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
+  typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+  LatticeFermion ssrc(sFGrid);
+  LatticeFermion sref(sFGrid);
+  LatticeFermion sresult(sFGrid);
+  WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
  
-    for(int x=0;x<latt4[0];x++){
-    for(int y=0;y<latt4[1];y++){
-    for(int z=0;z<latt4[2];z++){
-    for(int t=0;t<latt4[3];t++){
-    for(int s=0;s<Ls;s++){
-      std::vector<int> site({s,x,y,z,t});
-      SpinColourVector tmp;
-      peekSite(tmp,src,site);
-      pokeSite(tmp,ssrc,site);
-    }}}}}
+  for(int x=0;x<latt4[0];x++){
+  for(int y=0;y<latt4[1];y++){
+  for(int z=0;z<latt4[2];z++){
+  for(int t=0;t<latt4[3];t++){
+  for(int s=0;s<Ls;s++){
+    std::vector<int> site({s,x,y,z,t});
+    SpinColourVector tmp;
+    peekSite(tmp,src,site);
+    pokeSite(tmp,ssrc,site);
+  }}}}}

-    double t0=usecond();
-    sDw.Dhop(ssrc,sresult,0);
-    double t1=usecond();
+  double t0=usecond();
+  sDw.Dhop(ssrc,sresult,0);
+  double t1=usecond();

 #ifdef TIMERS_OFF
-    int ncall =10;
+  int ncall =10;
 #else 
-    int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
 #endif

-    PerformanceCounter Counter(8);
-    Counter.Start();
-    t0=usecond();
-    for(int i=0;i<ncall;i++){
-      sDw.Dhop(ssrc,sresult,0);
-    }
-    t1=usecond();
-    Counter.Stop();
+  PerformanceCounter Counter(8);
+  Counter.Start();
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    sDw.Dhop(ssrc,sresult,0);
+  }
+  t1=usecond();
+  Counter.Stop();
+  
+  if ( report ) {
+    Counter.Report();
+  } else { 
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+    std::cout<<"\t"<< flops/(t1-t0);
+  }

-    if ( report ) {
-      Counter.Report();
-    } else { 
-
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=1344*volume*ncall;
-      std::cout<<"\t"<< flops/(t1-t0);
-    }
-
-
-    LatticeFermion sr_eo(sFGrid);
-    LatticeFermion serr(sFGrid);
-    
-    LatticeFermion ssrc_e (sFrbGrid);
-    LatticeFermion ssrc_o (sFrbGrid);
-    LatticeFermion sr_e   (sFrbGrid);
-    LatticeFermion sr_o   (sFrbGrid);
+  LatticeFermion sr_eo(sFGrid);
+  LatticeFermion serr(sFGrid);
+  
+  LatticeFermion ssrc_e (sFrbGrid);
+  LatticeFermion ssrc_o (sFrbGrid);
+  LatticeFermion sr_e   (sFrbGrid);
+  LatticeFermion sr_o   (sFrbGrid);
      
-    pickCheckerboard(Even,ssrc_e,ssrc);
-    pickCheckerboard(Odd,ssrc_o,ssrc);
-
-    setCheckerboard(sr_eo,ssrc_o);
-    setCheckerboard(sr_eo,ssrc_e);
-    
-    sr_e = zero;
-    sr_o = zero;
+  pickCheckerboard(Even,ssrc_e,ssrc);
+  pickCheckerboard(Odd,ssrc_o,ssrc);
+  
+  setCheckerboard(sr_eo,ssrc_o);
+  setCheckerboard(sr_eo,ssrc_e);
    
+  sr_e = zero;
+  sr_o = zero;
+  
+  sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+  PerformanceCounter CounterSdw(8);
+  CounterSdw.Start();
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    __SSC_START;
    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
-    PerformanceCounter CounterSdw(8);
-    CounterSdw.Start();
-    t0=usecond();
-    for(int i=0;i<ncall;i++){
-      __SSC_START;
-      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
-      __SSC_STOP;
-    }
-    t1=usecond();
-    CounterSdw.Stop();
+    __SSC_STOP;
+  }
+  t1=usecond();
+  CounterSdw.Stop();

-    if ( report ) { 
-      CounterSdw.Report();
-    } else {
-
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=(1344.0*volume*ncall)/2;
-      std::cout<<"\t"<< flops/(t1-t0);
-    }
+  if ( report ) { 
+    CounterSdw.Report();
+  } else {
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(1344.0*volume*ncall)/2;
+    std::cout<<"\t"<< flops/(t1-t0);
+  }
 }


--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -26,7 +26,7 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -25,8 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-#include <PerfCount.h>
+#include <Grid/Grid.h>


 using namespace Grid;
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/lib
+AM_CXXFLAGS = -I$(top_srcdir)/include
 AM_LDFLAGS = -L$(top_builddir)/lib

 #
--- a/configure.ac
+++ b/configure.ac
@@ -10,6 +10,7 @@ AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
 AC_CANONICAL_SYSTEM
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
+AC_LINK_FILES(lib,include/Grid )
 AC_CONFIG_SRCDIR([lib/Grid.h])
 AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
@@ -231,7 +232,6 @@ esac
 # Chroma regression tests
 #
 AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
-
 case ${ac_CHROMA} in
     yes)
       echo Enabling tests regressing to Chroma
@@ -269,15 +269,15 @@ AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
-#echo
-#echo Checking doxygen support 
-#echo :::::::::::::::::::::::::::::::::::::::::::
-#AC_PROG_DOXYGEN
+echo
+echo Checking doxygen support 
+echo :::::::::::::::::::::::::::::::::::::::::::
+AC_PROG_DOXYGEN

-#if test -n "$DOXYGEN"
-#then
-#AC_CONFIG_FILES([docs/doxy.cfg])
-#fi
+if test -n "$DOXYGEN"
+then
+AC_CONFIG_FILES([docs/doxy.cfg])
+fi

 echo
 echo Creating configuration files
@@ -285,8 +285,15 @@ echo :::::::::::::::::::::::::::::::::::::::::::
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
+AC_CONFIG_FILES(tests/IO/Makefile)
+AC_CONFIG_FILES(tests/core/Makefile)
+AC_CONFIG_FILES(tests/debug/Makefile)
+AC_CONFIG_FILES(tests/forces/Makefile)
+AC_CONFIG_FILES(tests/hmc/Makefile)
+AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
+AC_CONFIG_FILES(prerequisites/Makefile)
 AC_OUTPUT


--- a/include/Grid
+++ b/include/Grid
@@ -0,0 +1 @@
+../lib
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -29,27 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H

-#include <algorithms/SparseMatrix.h>
-#include <algorithms/LinearOperator.h>
-#include <algorithms/Preconditioner.h>
+#include <Grid/algorithms/SparseMatrix.h>
+#include <Grid/algorithms/LinearOperator.h>
+#include <Grid/algorithms/Preconditioner.h>

-#include <algorithms/approx/Zolotarev.h>
-#include <algorithms/approx/Chebyshev.h>
-#include <algorithms/approx/Remez.h>
-#include <algorithms/approx/MultiShiftFunction.h>
+#include <Grid/algorithms/approx/Zolotarev.h>
+#include <Grid/algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/Remez.h>
+#include <Grid/algorithms/approx/MultiShiftFunction.h>

-#include <algorithms/iterative/ConjugateGradient.h>
-#include <algorithms/iterative/ConjugateResidual.h>
-#include <algorithms/iterative/NormalEquations.h>
-#include <algorithms/iterative/SchurRedBlack.h>
+#include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateResidual.h>
+#include <Grid/algorithms/iterative/NormalEquations.h>
+#include <Grid/algorithms/iterative/SchurRedBlack.h>

-#include <algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>

 // Lanczos support
-#include <algorithms/iterative/MatrixUtils.h>
-#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/MatrixUtils.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>

-#include <algorithms/CoarsenedMatrix.h>
+#include <Grid/algorithms/CoarsenedMatrix.h>

 // Eigen/lanczos
 // EigCg
--- a/lib/Cartesian.h
+++ b/lib/Cartesian.h
@@ -28,8 +28,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_H
 #define GRID_CARTESIAN_H

-#include <cartesian/Cartesian_base.h>
-#include <cartesian/Cartesian_full.h>
-#include <cartesian/Cartesian_red_black.h> 
+#include <Grid/cartesian/Cartesian_base.h>
+#include <Grid/cartesian/Cartesian_full.h>
+#include <Grid/cartesian/Cartesian_red_black.h> 

 #endif
--- a/lib/Communicator.h
+++ b/lib/Communicator.h
@@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H

-#include <communicator/Communicator_base.h>
+#include <Grid/communicator/Communicator_base.h>

 #endif
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@@ -28,17 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_H_
 #define _GRID_CSHIFT_H_

-#include <cshift/Cshift_common.h>
+#include <Grid/cshift/Cshift_common.h>

 #ifdef GRID_COMMS_NONE
-#include <cshift/Cshift_none.h>
+#include <Grid/cshift/Cshift_none.h>
 #endif

 #ifdef GRID_COMMS_MPI
-#include <cshift/Cshift_mpi.h>
+#include <Grid/cshift/Cshift_mpi.h>
 #endif 

 #ifdef GRID_COMMS_SHMEM
-#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
 #endif
--- a/lib/Eigen.inc
+++ b/lib/Eigen.inc
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -59,29 +59,29 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ///////////////////
 // Grid headers
 ///////////////////
-#include <serialisation/Serialisation.h>
-#include <Config.h>
-#include <Timer.h>
-#include <PerfCount.h>
-#include <Log.h>
-#include <AlignedAllocator.h>
-#include <Simd.h>
-#include <Threads.h>
-#include <Lexicographic.h>
-#include <Communicator.h> 
-#include <Cartesian.h>    
-#include <Tensors.h>      
-#include <Lattice.h>      
-#include <Cshift.h>       
-#include <Stencil.h>      
-#include <Algorithms.h>   
-#include <parallelIO/BinaryIO.h>
-#include <qcd/QCD.h>
-#include <parallelIO/NerscIO.h>
-#include <Init.h>
+#include <Grid/serialisation/Serialisation.h>
+#include "Config.h"
+#include <Grid/Timer.h>
+#include <Grid/PerfCount.h>
+#include <Grid/Log.h>
+#include <Grid/AlignedAllocator.h>
+#include <Grid/Simd.h>
+#include <Grid/Threads.h>
+#include <Grid/Lexicographic.h>
+#include <Grid/Communicator.h> 
+#include <Grid/Cartesian.h>    
+#include <Grid/Tensors.h>      
+#include <Grid/Lattice.h>      
+#include <Grid/Cshift.h>       
+#include <Grid/Stencil.h>      
+#include <Grid/Algorithms.h>   
+#include <Grid/parallelIO/BinaryIO.h>
+#include <Grid/qcd/QCD.h>
+#include <Grid/parallelIO/NerscIO.h>
+#include <Grid/Init.h>

-#include <qcd/hmc/NerscCheckpointer.h>
-#include <qcd/hmc/HmcRunner.h>
+#include <Grid/qcd/hmc/NerscCheckpointer.h>
+#include <Grid/qcd/hmc/HmcRunner.h>



--- a/lib/Lattice.h
+++ b/lib/Lattice.h
@@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_LATTICE_H
 #define GRID_LATTICE_H

-#include <lattice/Lattice_base.h>
+#include <Grid/lattice/Lattice_base.h>

 #endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/
+AM_CXXFLAGS = -I$(top_srcdir)/include/

 extra_sources=
 if BUILD_COMMS_MPI
@@ -17,16 +17,19 @@ endif
 #
 # Libraries
 #
-
 include Make.inc
+include Eigen.inc

 lib_LIBRARIES = libGrid.a
+
 libGrid_a_SOURCES = $(CCFILES) $(extra_sources)

+fftwdir = $(prefix)/lib/
+fftw_DATA = libfftw3.a

-#	qcd/action/fermion/PartialFractionFermion5D.cc\	\
 #
 # Include files
 #
-nobase_include_HEADERS=$(HFILES)
+otherincludedir = $(includedir)/Grid
+nobase_otherinclude_HEADERS =$(HFILES) $(EFILES) fftw3.h Config.h

--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -163,8 +163,8 @@ namespace Grid {

 };

-#include <simd/Grid_vector_types.h>
-#include <simd/Grid_vector_unops.h>
+#include "simd/Grid_vector_types.h"
+#include "simd/Grid_vector_unops.h"

 namespace Grid {
  // Default precision
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -30,7 +30,7 @@

 #include <thread>

- #include <stencil/Lebesgue.h>   // subdir aggregate
+ #include <Grid/stencil/Lebesgue.h>   // subdir aggregate

 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@@ -30,22 +30,22 @@ Author: neo <cossu@post.kek.jp>
 #ifndef GRID_MATH_H
 #define GRID_MATH_H

-#include <tensors/Tensor_traits.h>
-#include <tensors/Tensor_class.h>
-#include <tensors/Tensor_arith.h>
-#include <tensors/Tensor_inner.h>
-#include <tensors/Tensor_outer.h>
-#include <tensors/Tensor_transpose.h>
-#include <tensors/Tensor_trace.h>
-#include <tensors/Tensor_index.h>
-#include <tensors/Tensor_Ta.h>
-#include <tensors/Tensor_determinant.h>
-#include <tensors/Tensor_exp.h>
-//#include <tensors/Tensor_peek.h>
-//#include <tensors/Tensor_poke.h>
-#include <tensors/Tensor_reality.h>
-#include <tensors/Tensor_unary.h>
-#include <tensors/Tensor_extract_merge.h>
-#include <tensors/Tensor_logical.h>
+#include <Grid/tensors/Tensor_traits.h>
+#include <Grid/tensors/Tensor_class.h>
+#include <Grid/tensors/Tensor_arith.h>
+#include <Grid/tensors/Tensor_inner.h>
+#include <Grid/tensors/Tensor_outer.h>
+#include <Grid/tensors/Tensor_transpose.h>
+#include <Grid/tensors/Tensor_trace.h>
+#include <Grid/tensors/Tensor_index.h>
+#include <Grid/tensors/Tensor_Ta.h>
+#include <Grid/tensors/Tensor_determinant.h>
+#include <Grid/tensors/Tensor_exp.h>
+//#include <Grid/tensors/Tensor_peek.h>
+//#include <Grid/tensors/Tensor_poke.h>
+#include <Grid/tensors/Tensor_reality.h>
+#include <Grid/tensors/Tensor_unary.h>
+#include <Grid/tensors/Tensor_extract_merge.h>
+#include <Grid/tensors/Tensor_logical.h>

 #endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H

-#include <Grid.h>

 namespace Grid {

--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
 #define  GRID_ALGORITHM_SPARSE_MATRIX_H

-#include <Grid.h>

 namespace Grid {

--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CHEBYSHEV_H
 #define GRID_CHEBYSHEV_H

-#include<Grid.h>
-#include<algorithms/LinearOperator.h>
+#include <Grid/algorithms/LinearOperator.h>

 namespace Grid {

--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@@ -19,9 +19,9 @@
 #include <Config.h>

 #ifdef HAVE_GMP_H
-#include <algorithms/approx/bigfloat.h>
+#include "bigfloat.h"
 #else
-#include <algorithms/approx/bigfloat_double.h>
+#include "algorithms/approx/bigfloat_double.h"
 #endif

 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@@ -130,8 +130,8 @@ DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st,

 }

-#include <algorithms/iterative/Householder.h>
-#include <algorithms/iterative/Francis.h>
+#include "Householder.h"
+#include "Francis.h"

 #endif

--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -33,8 +33,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef USE_LAPACK
 #include <lapacke.h>
 #endif
-#include <algorithms/iterative/DenseMatrix.h>
-#include <algorithms/iterative/EigenSort.h>
+#include "DenseMatrix.h"
+#include "EigenSort.h"

 namespace Grid {

--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_BASE_H
 #define GRID_CARTESIAN_BASE_H

-#include <Grid.h>

 namespace Grid{

@@ -107,6 +106,12 @@ public:
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
+    virtual int iIndex(std::vector<int> &lcoor)
+    {
+        int idx=0;
+        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
+        return idx;
+    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0; 
@@ -123,12 +128,6 @@ public:
    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
-    inline int iIndex(std::vector<int> &lcoor)
-    {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
-        return idx;
-    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
@@ -220,7 +219,7 @@ public:
      }

      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
-      o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
+      o_idx= oIndex(lcoor);  // this implies divide by 2 on checkerdim
    }

    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@@ -32,17 +32,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

-    static const int CbRed  =0;
-    static const int CbBlack=1;
-    static const int Even   =CbRed;
-    static const int Odd    =CbBlack;
-
-    // Perhaps these are misplaced and 
-    // should be in sparse matrix.
-    // Also should make these a named enum type
-    static const int DaggerNo=0;
-    static const int DaggerYes=1;
-
+  static const int CbRed  =0;
+  static const int CbBlack=1;
+  static const int Even   =CbRed;
+  static const int Odd    =CbBlack;
+    
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
@@ -227,6 +221,18 @@ protected:
        return idx;
    };
        
+    virtual int iIndex(std::vector<int> &lcoor)
+    {
+        int idx=0;
+        for(int d=0;d<_ndimension;d++) {
+	  if( d==_checker_dim ) {
+	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
+	  } else { 
+	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
+	  }
+	}
+        return idx;
+    }
 };

 }
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -127,12 +127,21 @@ class CartesianCommunicator {
 			int recv_from_rank,
 			int bytes);

+    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+			    void *xmit,
+			    int xmit_to_rank,
+			    void *recv,
+			    int recv_from_rank,
+			    int bytes);
+
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
+
+    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

    ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -144,6 +144,28 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+					       void *xmit,
+					       int dest,
+					       void *recv,
+					       int from,
+					       int bytes)
+{
+  MPI_Request xrq;
+  MPI_Request rrq;
+  int rank = _processor;
+  int ierr;
+  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,dest,_processor,communicator,&rrq);
+  assert(ierr==0);
+  list.push_back(xrq);
+  list.push_back(rrq);
+}
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{
+  MPI_Startall(list.size(),&list[0]);
+}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -151,17 +173,12 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-  
-  assert(ierr==0);
-
-  list.push_back(xrq);
-  list.push_back(rrq);
+  std::vector<CommsRequest_t> reqs(0);
+  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
+  SendToRecvFromBegin(reqs);
+  for(int i=0;i<reqs.size();i++){
+    list.push_back(reqs[i]);
+  }
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -84,6 +84,19 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
  assert(0);
 }
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  assert(0);
+}
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{
+  assert(0);
+}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@@ -268,6 +268,10 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{
+  assert(0); //unimplemented
+}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -280,6 +284,15 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
  shmem_putmem(recv,xmit,bytes,dest);
 }
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  assert(0); // Unimplemented
+}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  //  shmem_quiet();      // I'm done
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -101,6 +101,7 @@ public:
    int begin(void) { return 0;};
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; };
+    const vobj & operator[](int i) const { return _odata[i]; };

 public:
    typedef typename vobj::scalar_type scalar_type;
@@ -324,27 +325,27 @@ PARALLEL_FOR_LOOP



-#include <lattice/Lattice_conformable.h>
+#include "Lattice_conformable.h"
 #define GRID_LATTICE_EXPRESSION_TEMPLATES
 #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
-#include <lattice/Lattice_ET.h>
+#include "Lattice_ET.h"
 #else 
-#include <lattice/Lattice_overload.h>
+#include "Lattice_overload.h"
 #endif
-#include <lattice/Lattice_arith.h>
-#include <lattice/Lattice_trace.h>
-#include <lattice/Lattice_transpose.h>
-#include <lattice/Lattice_local.h>
-#include <lattice/Lattice_reduction.h>
-#include <lattice/Lattice_peekpoke.h>
-#include <lattice/Lattice_reality.h>
-#include <lattice/Lattice_comparison_utils.h>
-#include <lattice/Lattice_comparison.h>
-#include <lattice/Lattice_coordinate.h>
-#include <lattice/Lattice_where.h>
-#include <lattice/Lattice_rng.h>
-#include <lattice/Lattice_unary.h>
-#include <lattice/Lattice_transfer.h>
+#include "Lattice_arith.h"
+#include "Lattice_trace.h"
+#include "Lattice_transpose.h"
+#include "Lattice_local.h"
+#include "Lattice_reduction.h"
+#include "Lattice_peekpoke.h"
+#include "Lattice_reality.h"
+#include "Lattice_comparison_utils.h"
+#include "Lattice_comparison.h"
+#include "Lattice_coordinate.h"
+#include "Lattice_where.h"
+#include "Lattice_rng.h"
+#include "Lattice_unary.h"
+#include "Lattice_transfer.h"


 #endif
--- a/lib/pugixml/pugixml.h
+++ b/lib/pugixml/pugixml.h
@@ -17,7 +17,7 @@
 #endif

 // Include user configuration file (this can define various configuration macros)
-#include <pugixml/pugiconfig.hpp>
+#include "pugiconfig.hpp"

 #ifndef HEADER_PUGIXML_HPP
 #define HEADER_PUGIXML_HPP
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -60,6 +60,12 @@ namespace QCD {
    static const int SpinIndex   = 1;
    static const int LorentzIndex= 0;

+    // Also should make these a named enum type
+    static const int DaggerNo=0;
+    static const int DaggerYes=1;
+    static const int InverseNo=0;
+    static const int InverseYes=1;
+
    // Useful traits is this a spin index
    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;

@@ -484,16 +490,16 @@ namespace QCD {
 }   //namespace QCD
 } // Grid

-#include <qcd/utils/SpaceTimeGrid.h>
-#include <qcd/spin/Dirac.h>
-#include <qcd/spin/TwoSpinor.h>
-#include <qcd/utils/LinalgUtils.h>
-#include <qcd/utils/CovariantCshift.h>
-#include <qcd/utils/SUn.h>
-#include <qcd/action/Actions.h>
-#include <qcd/hmc/integrators/Integrator.h>
-#include <qcd/hmc/integrators/Integrator_algorithm.h>
-#include <qcd/hmc/HMC.h>
+#include <Grid/qcd/utils/SpaceTimeGrid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/spin/TwoSpinor.h>
+#include <Grid/qcd/utils/LinalgUtils.h>
+#include <Grid/qcd/utils/CovariantCshift.h>
+#include <Grid/qcd/utils/SUn.h>
+#include <Grid/qcd/action/Actions.h>
+#include <Grid/qcd/hmc/integrators/Integrator.h>
+#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
+#include <Grid/qcd/hmc/HMC.h>


 #endif
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -40,25 +40,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Abstract base interface
 ////////////////////////////////////////////
-#include <qcd/action/ActionBase.h>
-#include <qcd/action/ActionParams.h>
+#include <Grid/qcd/action/ActionBase.h>
+#include <Grid/qcd/action/ActionParams.h>

 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
-#include <qcd/action/gauge/GaugeImpl.h>
-#include <qcd/utils/WilsonLoops.h>
+#include <Grid/qcd/action/gauge/GaugeImpl.h>
+#include <Grid/qcd/utils/WilsonLoops.h>

-#include <qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
-#include <qcd/action/fermion/FermionOperatorImpl.h>
-#include <qcd/action/fermion/FermionOperator.h>
-#include <qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
+#include <Grid/qcd/action/fermion/FermionOperator.h>
+#include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions

 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
-#include <qcd/action/gauge/WilsonGaugeAction.h>
-#include <qcd/action/gauge/PlaqPlusRectangleAction.h>
+#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
+#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>

 namespace Grid {
 namespace QCD {
@@ -107,41 +107,50 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 // for EVERY .cc file. This define centralises the list and restores global push of impl cases
 ////////////////////////////////////////////////////////////////////////////////////////////////////

-#define FermOpTemplateInstantiate(A) \
+
+#define FermOp4dVecTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		

+#define FermOp5dVecTemplateInstantiate(A) \
+  template class A<DomainWallVec5dImplF>;	\
+  template class A<DomainWallVec5dImplD>;	
+
+#define FermOpTemplateInstantiate(A) \
+ FermOp4dVecTemplateInstantiate(A) \
+ FermOp5dVecTemplateInstantiate(A) 
+
 #define GparityFermOpTemplateInstantiate(A) 

 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////

-#include <qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
-#include <qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+#include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types

-//#include <qcd/action/fermion/CloverFermion.h>
+//#include <Grid/qcd/action/fermion/CloverFermion.h>

-#include <qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
-#include <qcd/action/fermion/DomainWallFermion.h>
-#include <qcd/action/fermion/DomainWallFermion.h>
-#include <qcd/action/fermion/MobiusFermion.h>
-#include <qcd/action/fermion/ScaledShamirFermion.h>
-#include <qcd/action/fermion/MobiusZolotarevFermion.h>
-#include <qcd/action/fermion/ShamirZolotarevFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/MobiusFermion.h>
+#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
+#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>

-#include <qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
-#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>

-#include <qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
-#include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
@@ -222,21 +231,21 @@ typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
-#include <qcd/action/fermion/g5HermitianLinop.h>
+#include <Grid/qcd/action/fermion/g5HermitianLinop.h>

 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
-#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
+#include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>

-#include <qcd/action/pseudofermion/TwoFlavour.h>
-#include <qcd/action/pseudofermion/TwoFlavourRatio.h>
-#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
-#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>

-#include <qcd/action/pseudofermion/OneFlavourRational.h>
-#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
-#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
-#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>

 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
 #include <Grid.h>
+
+
 namespace Grid {
 namespace QCD {

@@ -45,486 +48,342 @@ namespace QCD {
 		   FourDimGrid,
 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
- {
- }
+ { }

- template<class Impl>
-  void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
-  {
-    // Assemble Din
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	Din = bs psi[s] + cs[s] psi[s+1}
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	//      Din+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
-	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
-      }
-    }
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag (Ls,1.0);
+  std::vector<RealD> upper(Ls,-1.0); upper[Ls-1]=mass;
+  std::vector<RealD> lower(Ls,-1.0); lower[0]   =mass;
+  M5D(psi,chi,chi,lower,diag,upper);
+}
+template<class Impl>
+void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = bs;
+  std::vector<RealD> upper= cs;
+  std::vector<RealD> lower= cs; 
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,Din,lower,diag,upper);
+}
+template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = beo;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);
+  for(int i=0;i<Ls;i++) {
+    upper[i]=-ceo[i];
+    lower[i]=-ceo[i];
  }
- template<class Impl>
-  void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
-  {
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
-	axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);
-	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
-      } else {
-	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
-	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
-      }
-    }
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,chi,lower,diag,upper);
+}
+template<class Impl>
+void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = bee;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);
+  for(int i=0;i<Ls;i++) {
+    upper[i]=-cee[i];
+    lower[i]=-cee[i];
  }
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,chi,lower,diag,upper);
+}

-  // override multiply
- template<class Impl>
-  RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = bee;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);

-    FermionField Din(psi._grid);
-
-    // Assemble Din
-    /*
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	Din = bs psi[s] + cs[s] psi[s+1}
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	//      Din+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
-	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
-      }
-    }
-    */
-    Meooe5D(psi,Din);
-
-    this->DW(Din,chi,DaggerNo);
-    // ((b D_W + D_w hop terms +1) on s-diag
-    axpby(chi,1.0,1.0,chi,psi); 
-
-    // Call Mooee??
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ){
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,Ls-1);
-      } else if ( s==(Ls-1)) {
-	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,0);
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
-      }
-    }
-    return norm2(chi);
-  }
-
- template<class Impl>
-  RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
-  {
-    // Under adjoint
-    //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
-    //D2- P+     D2+            P-D1-^dag D2+dag
-
-    FermionField Din(psi._grid);
-    // Apply Dw
-    this->DW(psi,Din,DaggerYes); 
-
-    MeooeDag5D(Din,chi);
-
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-
-      // Collect the terms in DW
-      //	Chi = bs Din[s] + cs[s] Din[s+1}
-      //    Chi+= -mass*cs[s] psi[s+1}
-      /*
-      if ( s==0 ) {
-	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus (chi,bs[s],Din,-mass*cs[0],Din,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
-      } else {
-	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
-      }
-      */
-
-      // FIXME just call MooeeDag??
-
-      // Collect the terms indept of DW
-      if ( s==0 ){
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,Ls-1);
-      } else if ( s==(Ls-1)) {
-	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
-      }
-    }
-    // ((b D_W + D_w hop terms +1) on s-diag
-    axpby (chi,1.0,1.0,chi,psi); 
-    return norm2(chi);
-  }
-
-  // half checkerboard operations
- template<class Impl>
-  void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-
-    FermionField tmp(psi._grid);
+  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
-    Meooe5D(psi,tmp); 
-#if 0
-    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	tmp = bs psi[s] + cs[s] psi[s+1}
-	//      tmp+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
-	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
-	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
-	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      }
-    }
-    std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
-#endif
-
-    // Apply 4d dslash
-    if ( psi.checkerboard == Odd ) {
-      this->DhopEO(tmp,chi,DaggerNo);
+    if ( s==0 ) {
+      upper[s] = -cee[s+1] ;
+      lower[s] = mass*cee[Ls-1];
+    } else if ( s==(Ls-1)) { 
+      upper[s] = mass*cee[0];
+      lower[s] = -cee[s-1];
    } else {
-      this->DhopOE(tmp,chi,DaggerNo);
+      upper[s]=-cee[s+1];
+      lower[s]=-cee[s-1];
    }
  }

-  template<class Impl>
-  void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-  {
-    FermionField tmp(psi._grid);
-    // Apply 4d dslash
-    if ( psi.checkerboard == Odd ) {
-      this->DhopEO(psi,tmp,DaggerYes);
-    } else {
-      this->DhopOE(psi,tmp,DaggerYes);
-    }
+  M5Ddag(psi,psi,chi,lower,diag,upper);
+}

-    MeooeDag5D(tmp,chi); 
-#if 0
-    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
-    // Assemble the 5d matrix
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pplus(chi,beo[s],tmp,   -ceo[s+1]  ,tmp,s,s+1);
-	axpby_ssp_pminus(chi,   1.0,chi,mass*ceo[Ls-1],tmp,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus(chi,beo[s],tmp,mass*ceo[0],tmp,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-ceo[s-1],tmp,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,beo[s],tmp,-ceo[s+1],tmp,s,s+1);
-	axpby_ssp_pminus(chi,1.0   ,chi,-ceo[s-1],tmp,s,s-1);
-      }
-    }
-    std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
-#endif
+template<class Impl>
+void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag(Ls,1.0);
+  std::vector<RealD> upper(Ls,-1.0);
+  std::vector<RealD> lower(Ls,-1.0);
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5Ddag(psi,chi,chi,lower,diag,upper);
+}

+template<class Impl>
+void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag =bs;
+  std::vector<RealD> upper=cs;
+  std::vector<RealD> lower=cs;
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5Ddag(psi,psi,Din,lower,diag,upper);
+}
+
+template<class Impl>
+RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  
+  FermionField Din(psi._grid);
+  
+  // Assemble Din
+  Meooe5D(psi,Din);
+  
+  this->DW(Din,chi,DaggerNo);
+  // ((b D_W + D_w hop terms +1) on s-diag
+  axpby(chi,1.0,1.0,chi,psi); 
+  
+  M5D(psi,chi);
+  return(norm2(chi));
+}
+
+template<class Impl>
+RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
+{
+  // Under adjoint
+  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
+  //D2- P+     D2+            P-D1-^dag D2+dag
+  
+  FermionField Din(psi._grid);
+  // Apply Dw
+  this->DW(psi,Din,DaggerYes); 
+  
+  MeooeDag5D(Din,chi);
+  
+  M5Ddag(psi,chi);
+  // ((b D_W + D_w hop terms +1) on s-diag
+  axpby (chi,1.0,1.0,chi,psi); 
+  return norm2(chi);
+}
+
+// half checkerboard operations
+template<class Impl>
+void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  FermionField tmp(psi._grid);
+
+  Meooe5D(psi,tmp); 
+
+  if ( psi.checkerboard == Odd ) {
+    this->DhopEO(tmp,chi,DaggerNo);
+  } else {
+    this->DhopOE(tmp,chi,DaggerNo);
  }
+}

- template<class Impl>
-  void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    for (int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,mass*cee[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(chi,bee[s],psi,mass*cee[s],psi,s,0);
-	axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(chi,bee[s],psi,-cee[s],psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
-      }
-    }
+template<class Impl>
+void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+{
+  FermionField tmp(psi._grid);
+  // Apply 4d dslash
+  if ( psi.checkerboard == Odd ) {
+    this->DhopEO(psi,tmp,DaggerYes);
+  } else {
+    this->DhopOE(psi,tmp,DaggerYes);
  }
+  MeooeDag5D(tmp,chi); 
+}

- template<class Impl>
-  void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-    int Ls=this->Ls;
-    FermionField tmp(psi._grid);
-    // Assemble the 5d matrix
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	tmp = bs psi[s] + cs[s] psi[s+1}
-	//      tmp+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
-	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
-	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
-	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      }
-    }
-    // Apply 4d dslash fragment
-    this->DhopDir(tmp,chi,dir,disp);
+template<class Impl>
+void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+  FermionField tmp(psi._grid);
+  Meo5D(psi,tmp);
+  // Apply 4d dslash fragment
+  this->DhopDir(tmp,chi,dir,disp);
+}
+// force terms; five routines; default to Dhop on diagonal
+template<class Impl>
+void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
+  
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDeriv(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,Din);
+    this->DhopDeriv(mat,Din,V,dag);
  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    for (int s=0;s<Ls;s++){
-      // Assemble the 5d matrix
-      if ( s==0 ) {
-	axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1]  ,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,mass*cee[Ls-1],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus(chi,bee[s],psi,mass*cee[0],psi,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-cee[s-1],psi,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1],psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0   ,chi,-cee[s-1],psi,s,s-1);
-      }
-    }
-  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    // Apply (L^{\prime})^{-1}
-    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
-    for (int s=1;s<Ls;s++){
-      axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-    }
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
-    }
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-      axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
-    }	
-    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
-    
-    // Apply U^{-1}
-    for (int s=Ls-2;s>=0;s--){
-      axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls]
-    }
-  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    // Apply (U^{\prime})^{-dagger}
-    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
-    for (int s=1;s<Ls;s++){
-      axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
-    }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-      axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
-    }
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
-    }	
-    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
-    
-    // Apply L^{-dagger}
-    for (int s=Ls-2;s>=0;s--){
-      axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
-    }
-  }
-
-  // force terms; five routines; default to Dhop on diagonal
-  template<class Impl>
-  void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDeriv(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-      Meooe5D(U,Din);
-      this->DhopDeriv(mat,Din,V,dag);
-    }
-  };
- template<class Impl>
-  void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDerivOE(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+};
+template<class Impl>
+void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
+  
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDerivOE(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
      Meooe5D(U,Din);
      this->DhopDerivOE(mat,Din,V,dag);
-    }
-  };
- template<class Impl>
-  void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDerivEO(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-      Meooe5D(U,Din);
-      this->DhopDerivEO(mat,Din,V,dag);
-    }
-  };
+  }
+};
+template<class Impl>
+void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
  
-  // Tanh
- template<class Impl>
-  void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
-  {
-    SetCoefficientsZolotarev(1.0,zdata,b,c);
-
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDerivEO(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,Din);
+    this->DhopDerivEO(mat,Din,V,dag);
  }
-  //Zolo
- template<class Impl>
-  void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
-  {
-    int Ls=this->Ls;
+};
+  
+// Tanh
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  SetCoefficientsZolotarev(1.0,zdata,b,c);
+}
+//Zolo
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  int Ls=this->Ls;

-    ///////////////////////////////////////////////////////////
-    // The Cayley coeffs (unprec)
-    ///////////////////////////////////////////////////////////
-    omega.resize(Ls);
-    bs.resize(Ls);
-    cs.resize(Ls);
-    as.resize(Ls);
+  ///////////////////////////////////////////////////////////
+  // The Cayley coeffs (unprec)
+  ///////////////////////////////////////////////////////////
+  omega.resize(Ls);
+  bs.resize(Ls);
+  cs.resize(Ls);
+  as.resize(Ls);
+  
+  // 
+  // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
+  //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
+  //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
+  //
+  //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
+  //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
+  //
+  // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
+  // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
+  //
+  // So 
+  //
+  // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
+  //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
+  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
+  //
+  // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
+  //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
+  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
+  // 
    
-    // 
-    // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
-    //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
-    //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
-    //
-    //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
-    //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
-    //
-    // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
-    // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
-    //
-    // So 
-    //
-    // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
-    //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
-    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-    //
-    // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
-    //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
-    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-    // 
+  double bpc = b+c;
+  double bmc = b-c;
+  for(int i=0; i < Ls; i++){
+    as[i] = 1.0;
+    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    bs[i] = 0.5*(bpc/omega[i] + bmc);
+    cs[i] = 0.5*(bpc/omega[i] - bmc);
+  }
+  
+  ////////////////////////////////////////////////////////
+  // Constants for the preconditioned matrix Cayley form
+  ////////////////////////////////////////////////////////
+  bee.resize(Ls);
+  cee.resize(Ls);
+  beo.resize(Ls);
+  ceo.resize(Ls);
+  
+  for(int i=0;i<Ls;i++){
+    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
+    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
+    beo[i]=as[i]*bs[i];
+    ceo[i]=-as[i]*cs[i];
+  }
+  
+  aee.resize(Ls);
+  aeo.resize(Ls);
+  for(int i=0;i<Ls;i++){
+    aee[i]=cee[i];
+    aeo[i]=ceo[i];
+  }
+  
+  //////////////////////////////////////////
+  // LDU decomposition of eeoo
+  //////////////////////////////////////////
+  dee.resize(Ls);
+  lee.resize(Ls);
+  leem.resize(Ls);
+  uee.resize(Ls);
+  ueem.resize(Ls);
+  
+  for(int i=0;i<Ls;i++){
    
-    double bpc = b+c;
-    double bmc = b-c;
-    for(int i=0; i < Ls; i++){
-      as[i] = 1.0;
-      omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
-      bs[i] = 0.5*(bpc/omega[i] + bmc);
-      cs[i] = 0.5*(bpc/omega[i] - bmc);
-    }
-
-    ////////////////////////////////////////////////////////
-    // Constants for the preconditioned matrix Cayley form
-    ////////////////////////////////////////////////////////
-    bee.resize(Ls);
-    cee.resize(Ls);
-    beo.resize(Ls);
-    ceo.resize(Ls);
+    dee[i] = bee[i];
    
-    for(int i=0;i<Ls;i++){
-      bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
-      cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
-      beo[i]=as[i]*bs[i];
-      ceo[i]=-as[i]*cs[i];
-    }
-
-    aee.resize(Ls);
-    aeo.resize(Ls);
-    for(int i=0;i<Ls;i++){
-      aee[i]=cee[i];
-      aeo[i]=ceo[i];
-    }
-
-    //////////////////////////////////////////
-    // LDU decomposition of eeoo
-    //////////////////////////////////////////
-    dee.resize(Ls);
-    lee.resize(Ls);
-    leem.resize(Ls);
-    uee.resize(Ls);
-    ueem.resize(Ls);
-    
-    for(int i=0;i<Ls;i++){
+    if ( i < Ls-1 ) {
      
-      dee[i] = bee[i];
+      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
      
-      if ( i < Ls-1 ) {
-	
-	lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-	    
-	leem[i]=mass*cee[Ls-1]/bee[0];
-	for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
-	
-	uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-	
-	ueem[i]=mass;
-	for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
-	ueem[i]*= aee[0]/bee[0];
-	    
-      } else { 
-	lee[i] =0.0;
-	leem[i]=0.0;
-	uee[i] =0.0;
-	ueem[i]=0.0;
-      }
-    }
-	
-    { 
-      double delta_d=mass*cee[Ls-1];
-      for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
-      dee[Ls-1] += delta_d;
+      leem[i]=mass*cee[Ls-1]/bee[0];
+      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
+      
+      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
+      
+      ueem[i]=mass;
+      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
+      ueem[i]*= aee[0]/bee[0];
+      
+    } else { 
+      lee[i] =0.0;
+      leem[i]=0.0;
+      uee[i] =0.0;
+      ueem[i]=0.0;
    }
  }
+	
+  { 
+    double delta_d=mass*cee[Ls-1];
+    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
+    dee[Ls-1] += delta_d;
+  }  
+}
+
+

  FermOpTemplateInstantiate(CayleyFermion5D);
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -51,6 +51,29 @@ namespace Grid {
      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+      virtual void   Meo5D (const FermionField &psi, FermionField &chi);
+
+      virtual void   M5D   (const FermionField &psi, FermionField &chi);
+      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
+
+      /////////////////////////////////////////////////////
+      // Instantiate different versions depending on Impl
+      /////////////////////////////////////////////////////
+      void M5D(const FermionField &psi,
+	       const FermionField &phi, 
+	       FermionField &chi,
+	       std::vector<RealD> &lower,
+	       std::vector<RealD> &diag,
+	       std::vector<RealD> &upper);
+
+      void M5Ddag(const FermionField &psi,
+		  const FermionField &phi, 
+		  FermionField &chi,
+		  std::vector<RealD> &lower,
+		  std::vector<RealD> &diag,
+		  std::vector<RealD> &upper);
+      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
+
      virtual void   Instantiatable(void)=0;

      // force terms; five routines; default to Dhop on diagonal
@@ -94,6 +117,8 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

+
+
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
@@ -101,5 +126,15 @@ namespace Grid {

  }
 }
+#define INSTANTIATE_DPERP(A)\
+template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
+					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
+					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
+template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
+
+#define CAYLEY_DPERP_CACHE
+#undef  CAYLEY_DPERP_LINALG

 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -0,0 +1,209 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+  // Pminus fowards
+  // Pplus  backwards..
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<RealD> &lower,
+				std::vector<RealD> &diag,
+				std::vector<RealD> &upper)
+{
+  int Ls =this->Ls;
+  GridBase *grid=psi._grid;
+  assert(phi.checkerboard == psi.checkerboard);
+  chi.checkerboard=psi.checkerboard;
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    for(int s=0;s<Ls;s++){
+      auto tmp = psi._odata[0];
+      if ( s==0 ) {
+ 	                            spProj5m(tmp,psi._odata[ss+s+1]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+	                    spProj5p(tmp,psi._odata[ss+Ls-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      } else if ( s==(Ls-1)) {
+	                            spProj5m(tmp,psi._odata[ss+0]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+ 	                    spProj5p(tmp,psi._odata[ss+s-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      } else { 
+	                            spProj5m(tmp,psi._odata[ss+s+1]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+	                    spProj5p(tmp,psi._odata[ss+s-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      }
+    }
+  }
+}
+
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<RealD> &lower,
+				   std::vector<RealD> &diag,
+				   std::vector<RealD> &upper)
+{
+  int Ls =this->Ls;
+  GridBase *grid=psi._grid;
+  assert(phi.checkerboard == psi.checkerboard);
+  chi.checkerboard=psi.checkerboard;
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    auto tmp = psi._odata[0];
+    for(int s=0;s<Ls;s++){
+      if ( s==0 ) {
+	spProj5p(tmp,psi._odata[ss+s+1]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+	spProj5m(tmp,psi._odata[ss+Ls-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      } else if ( s==(Ls-1)) {
+	spProj5p(tmp,psi._odata[ss+0]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+	spProj5m(tmp,psi._odata[ss+s-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      } else { 
+	spProj5p(tmp,psi._odata[ss+s+1]);
+	chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+	spProj5m(tmp,psi._odata[ss+s-1]);
+	chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+      }
+    }
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  GridBase *grid=psi._grid;
+  int Ls=this->Ls;
+
+  chi.checkerboard=psi.checkerboard;
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    auto tmp = psi._odata[0];
+
+    // Apply (L^{\prime})^{-1}
+    chi[ss]=psi[ss]; // chi[0]=psi[0]
+    for(int s=1;s<Ls;s++){
+                            spProj5p(tmp,chi[ss+s-1]);  
+      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
+    }
+    // L_m^{-1} 
+    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+                                   spProj5m(tmp,chi[ss+s]);    
+      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
+    }
+    // U_m^{-1} D^{-1}
+    for (int s=0;s<Ls-1;s++){
+      // Chi[s] + 1/d chi[s] 
+                                                spProj5p(tmp,chi[ss+Ls-1]); 
+      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
+    }	
+    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
+      
+    // Apply U^{-1}
+    for (int s=Ls-2;s>=0;s--){
+                            spProj5m(tmp,chi[ss+s+1]);  
+      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
+    }
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  GridBase *grid=psi._grid;
+  int Ls=this->Ls;
+
+  assert(psi.checkerboard == psi.checkerboard);
+  chi.checkerboard=psi.checkerboard;
+
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+
+    auto tmp = psi._odata[0];
+
+    // Apply (U^{\prime})^{-dagger}
+    chi[ss]=psi[ss];
+    for (int s=1;s<Ls;s++){
+                            spProj5m(tmp,chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s]-uee[s-1]*tmp;
+    }
+    // U_m^{-\dagger} 
+    for (int s=0;s<Ls-1;s++){
+                                   spProj5p(tmp,chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - ueem[s]*tmp;
+    }
+
+    // L_m^{-\dagger} D^{-dagger}
+    for (int s=0;s<Ls-1;s++){
+      spProj5m(tmp,chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(leem[s]/dee[Ls-1])*tmp;
+    }	
+    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
+  
+    // Apply L^{-dagger}
+    for (int s=Ls-2;s>=0;s--){
+      spProj5p(tmp,chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
+    }
+  }
+}
+
+#ifdef CAYLEY_DPERP_CACHE
+  INSTANTIATE_DPERP(WilsonImplF);
+  INSTANTIATE_DPERP(WilsonImplD);
+  INSTANTIATE_DPERP(GparityWilsonImplF);
+  INSTANTIATE_DPERP(GparityWilsonImplD);
+#endif
+
+}}
--- a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
@@ -0,0 +1,133 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Eigen/Dense>
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+  /*
+   * Dense matrix versions of routines
+   */
+
+  /*
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
+}
+  
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
+}
+  */
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  int Ls=this->Ls;
+  int LLs = psi._grid->_rdimensions[0];
+  int vol = psi._grid->oSites()/LLs;
+  
+  chi.checkerboard=psi.checkerboard;
+  
+  assert(Ls==LLs);
+  
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0];
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+  
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  // For the non-vectorised s-direction this is simple
+  
+  for(auto site=0;site<vol;site++){
+    
+    SiteSpinor     SiteChi;
+    SiteHalfSpinor SitePplus;
+    SiteHalfSpinor SitePminus;
+    
+    for(int s1=0;s1<Ls;s1++){
+      SiteChi =zero;
+      for(int s2=0;s2<Ls;s2++){
+	int lex2 = s2+Ls*site;
+	
+	if ( PplusMat(s1,s2) != 0.0 ) {
+	  spProj5p(SitePplus,psi[lex2]);
+	  accumRecon5p(SiteChi,PplusMat (s1,s2)*SitePplus);
+	}
+	
+	if ( PminusMat(s1,s2) != 0.0 ) {
+	  spProj5m(SitePminus,psi[lex2]);
+	  accumRecon5m(SiteChi,PminusMat(s1,s2)*SitePminus);
+	}
+      }
+      chi[s1+Ls*site] = SiteChi*0.5;
+    }
+  }
+}
+
+template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+}}
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@@ -0,0 +1,149 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+  // Pminus fowards
+  // Pplus  backwards
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<RealD> &lower,
+				std::vector<RealD> &diag,
+				std::vector<RealD> &upper)
+{
+  int Ls=this->Ls;
+  for(int s=0;s<Ls;s++){
+    if ( s==0 ) {
+      axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1);
+      axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,Ls-1);
+    } else if ( s==(Ls-1)) { 
+      axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,0);
+      axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,s-1);
+    } else {
+      axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1);
+      axpby_ssp_pplus(chi,1.0,chi,lower[s],psi,s,s-1);
+    }
+  }
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<RealD> &lower,
+				   std::vector<RealD> &diag,
+				   std::vector<RealD> &upper)
+{
+  int Ls=this->Ls;
+  for(int s=0;s<Ls;s++){
+    if ( s==0 ) {
+      axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1);
+      axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,Ls-1);
+    } else if ( s==(Ls-1)) { 
+      axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,0);
+      axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s-1);
+    } else {
+      axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1);
+      axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s-1);
+    }
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  chi.checkerboard=psi.checkerboard;
+  int Ls=this->Ls;
+  // Apply (L^{\prime})^{-1}
+  axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
+  for (int s=1;s<Ls;s++){
+    axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
+  }
+  // L_m^{-1} 
+  for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+    axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
+  }
+  // U_m^{-1} D^{-1}
+  for (int s=0;s<Ls-1;s++){
+    // Chi[s] + 1/d chi[s] 
+    axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
+  }	
+  axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
+  
+  // Apply U^{-1}
+  for (int s=Ls-2;s>=0;s--){
+    axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls]
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  chi.checkerboard=psi.checkerboard;
+  int Ls=this->Ls;
+  // Apply (U^{\prime})^{-dagger}
+  axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
+  for (int s=1;s<Ls;s++){
+    axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
+  }
+  // U_m^{-\dagger} 
+  for (int s=0;s<Ls-1;s++){
+    axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
+  }
+  // L_m^{-\dagger} D^{-dagger}
+  for (int s=0;s<Ls-1;s++){
+    axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
+  }	
+  axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
+  
+  // Apply L^{-dagger}
+  for (int s=Ls-2;s>=0;s--){
+    axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
+  }
+}
+
+
+#ifdef CAYLEY_DPERP_LINALG
+  INSTANTIATE(WilsonImplF);
+  INSTANTIATE(WilsonImplD);
+  INSTANTIATE(GparityWilsonImplF);
+  INSTANTIATE(GparityWilsonImplD);
+#endif
+
+}
+}
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -0,0 +1,305 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Eigen/Dense>
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+  /*
+   * Dense matrix versions of routines
+   */
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
+}
+  
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<RealD> &lower,
+				std::vector<RealD> &diag,
+				std::vector<RealD> &upper)
+{
+  GridBase *grid=psi._grid;
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  int nsimd= Simd::Nsimd();
+
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd);
+  assert(phi.checkerboard == psi.checkerboard);
+
+  chi.checkerboard=psi.checkerboard;
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  for(int o=0;o<LLs;o++){ // outer
+  for(int i=0;i<nsimd;i++){ //inner
+    int s  = o+i*LLs;
+    int ss = o*nsimd+i;
+    u_p[ss] = upper[s];
+    l_p[ss] = lower[s];
+    d_p[ss] = diag[s];
+  }}
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5m(hp,psi[ss+vp]);
+      spProj5p(hm,psi[ss+vm]);
+      
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5m(fp,hp);
+      spRecon5p(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+  }
+}
+
+template<class Impl>  
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<RealD> &lower,
+				   std::vector<RealD> &diag,
+				   std::vector<RealD> &upper)
+{
+  GridBase *grid=psi._grid;
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0];
+  int nsimd= Simd::Nsimd();
+
+  Vector<iSinglet<Simd> > u(LLs);
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd);
+  assert(phi.checkerboard == psi.checkerboard);
+
+  chi.checkerboard=psi.checkerboard;
+
+  // just directly address via type pun
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  for(int o=0;o<LLs;o++){ // outer
+  for(int i=0;i<nsimd;i++){ //inner
+    int s  = o+i*LLs;
+    int ss = o*nsimd+i;
+    u_p[ss] = upper[s];
+    l_p[ss] = lower[s];
+    d_p[ss] = diag[s];
+  }}
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;
+
+      spProj5p(hp,psi[ss+vp]);
+      spProj5m(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+      
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5p(fp,hp);
+      spRecon5m(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+    }
+  }
+}
+
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  int Ls=this->Ls;
+  int LLs = psi._grid->_rdimensions[0];
+  int vol = psi._grid->oSites()/LLs;
+
+  chi.checkerboard=psi.checkerboard;
+  
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0];
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+  
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+  
+  typedef typename SiteHalfSpinor::scalar_type scalar_type;
+  const int Nsimd=Simd::Nsimd();
+  Vector<iSinglet<Simd> > Matp(Ls*LLs);
+  Vector<iSinglet<Simd> > Matm(Ls*LLs);
+
+  for(int s2=0;s2<Ls;s2++){
+  for(int s1=0;s1<LLs;s1++){
+    int istride = LLs;
+    int ostride = 1;
+      Simd Vp;
+      Simd Vm;
+      scalar_type *sp = (scalar_type *)&Vp;
+      scalar_type *sm = (scalar_type *)&Vm;
+      for(int l=0;l<Nsimd;l++){
+	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
+	sm[l] = PminusMat(l*istride+s1*ostride,s2);
+      }
+      Matp[LLs*s2+s1] = Vp;
+      Matm[LLs*s2+s1] = Vm;
+    }
+  }
+  
+  // Dynamic allocate on stack to get per thread without serialised heap acces
+PARALLEL_FOR_LOOP
+  for(auto site=0;site<vol;site++){
+    
+    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
+    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
+    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
+
+    Vector<SiteHalfSpinor> SitePplus(LLs);
+    Vector<SiteHalfSpinor> SitePminus(LLs);
+    Vector<SiteHalfSpinor> SiteChiP(LLs);
+    Vector<SiteHalfSpinor> SiteChiM(LLs);
+    Vector<SiteSpinor>     SiteChi(LLs);
+
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+
+    for(int s=0;s<LLs;s++){
+      int lex = s+LLs*site;
+      spProj5p(SitePplus[s] ,psi[lex]);
+      spProj5m(SitePminus[s],psi[lex]);
+      SiteChiP[s]=zero;
+      SiteChiM[s]=zero;
+    }
+      
+    int s=0;
+    for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
+	vbroadcast(BcastP,SitePplus [s2],l);
+	vbroadcast(BcastM,SitePminus[s2],l);
+	for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
+	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
+	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
+	}
+      s++;
+    }}
+
+    for(int s=0;s<LLs;s++){
+      int lex = s+LLs*site;
+      spRecon5p(SiteChi[s],SiteChiP[s]);
+      accumRecon5m(SiteChi[s],SiteChiM[s]);
+      chi[lex] = SiteChi[s]*0.5;
+    }
+  }
+}
+
+INSTANTIATE_DPERP(DomainWallVec5dImplD);
+INSTANTIATE_DPERP(DomainWallVec5dImplF);
+
+template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+}}
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
 #define  GRID_QCD_DOMAIN_WALL_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -113,6 +113,8 @@ namespace Grid {
    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:

+      const bool LsVectorised=false;
+
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@@ -191,8 +193,10 @@ PARALLEL_FOR_LOOP
    // Single flavour four spinors with colour index, 5d redblack
    ///////
    template<class S,int Nrepresentation=Nc>
-    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
+    
+      const bool LsVectorised=true;

      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

@@ -221,7 +225,7 @@ PARALLEL_FOR_LOOP

      ImplParams Params;

-      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+      DomainWallVec5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 

      bool overlapCommsCompute(void) { return false; };
    
@@ -287,6 +291,8 @@ PARALLEL_FOR_LOOP
    class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
    public:

+      const bool LsVectorised=false;
+
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@@ -449,7 +455,7 @@ PARALLEL_FOR_LOOP
 	auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde,A));  
 PARALLEL_FOR_LOOP
        for(auto ss=tmp.begin();ss<tmp.end();ss++){
-	  link[ss]() = tmp[ss](0,0) - conjugate(tmp[ss](1,1)) ;
+	  link[ss]() = tmp[ss](0,0) - conjugate(tmp[ss](1,1)) ; // IS THIS SIGN RIGHT?
 	}
 	PokeIndex<LorentzIndex>(mat,link,mu);
 	return;
@@ -477,9 +483,9 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

-    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
-    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
-    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
+    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
+    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double

    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
--- a/lib/qcd/action/fermion/MobiusFermion.h
+++ b/lib/qcd/action/fermion/MobiusFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_FERMION_H
 #define  GRID_QCD_MOBIUS_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/lib/qcd/action/fermion/MobiusZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
 #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
 #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SCALED_SHAMIR_FERMION_H
 #define  GRID_QCD_SCALED_SHAMIR_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/lib/qcd/action/fermion/ShamirZolotarevFermion.h
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -48,9 +48,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 				       GridRedBlackCartesian &FourDimRedBlackGrid,
 				       RealD _M5,const ImplParams &p) :
  Kernels(p),
-  _FiveDimGrid(&FiveDimGrid),
+  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid(&FourDimGrid),
+  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
@@ -62,60 +62,83 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid)
 {
-  // some assertions
-  assert(FiveDimGrid._ndimension==5);
-  assert(FourDimGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._ndimension==5);
-  assert(FourDimRedBlackGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._checker_dim==1);
+  if (Impl::LsVectorised) { 

-  // Dimension zero of the five-d is the Ls direction
-  Ls=FiveDimGrid._fdimensions[0];
-  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-  assert(FiveDimRedBlackGrid._processors[0] ==1);
-  assert(FiveDimRedBlackGrid._simd_layout[0]==1);
-  assert(FiveDimGrid._processors[0]         ==1);
-  assert(FiveDimGrid._simd_layout[0]        ==1);
+    int nsimd = Simd::Nsimd();
+    
+    // some assertions
+    assert(FiveDimGrid._ndimension==5);
+    assert(FiveDimRedBlackGrid._ndimension==5);
+    assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+    assert(FourDimGrid._ndimension==4);

-  // Other dimensions must match the decomposition of the four-D fields 
-  for(int d=0;d<4;d++){
-    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    // Dimension zero of the five-d is the Ls direction
+    Ls=FiveDimGrid._fdimensions[0];
+    assert(FiveDimGrid._processors[0]         ==1);
+    assert(FiveDimGrid._simd_layout[0]        ==nsimd);

-    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
-    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+    assert(FiveDimRedBlackGrid._processors[0] ==1);
+    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

-    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
-    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+    // Other dimensions must match the decomposition of the four-D fields 
+    for(int d=0;d<4;d++){
+      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+      
+      assert(FourDimGrid._simd_layout[d]=1);
+      assert(FourDimRedBlackGrid._simd_layout[d]=1);
+      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);

-    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
-    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    }
+
+  } else {
+
+    // some assertions
+    assert(FiveDimGrid._ndimension==5);
+    assert(FourDimGrid._ndimension==4);
+    assert(FiveDimRedBlackGrid._ndimension==5);
+    assert(FourDimRedBlackGrid._ndimension==4);
+    assert(FiveDimRedBlackGrid._checker_dim==1);
+    
+    // Dimension zero of the five-d is the Ls direction
+    Ls=FiveDimGrid._fdimensions[0];
+    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+    assert(FiveDimRedBlackGrid._processors[0] ==1);
+    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
+    assert(FiveDimGrid._processors[0]         ==1);
+    assert(FiveDimGrid._simd_layout[0]        ==1);
+    
+    // Other dimensions must match the decomposition of the four-D fields 
+    for(int d=0;d<4;d++){
+      assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+      
+      assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+      
+      assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
+      assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+      
+      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    }
  }
-
+    
  // Allocate the required comms buffer
  ImportGauge(_Umu);
-}  
-
+}
+  /*
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
 				       GridCartesian         &FiveDimGrid,
 				       GridRedBlackCartesian &FiveDimRedBlackGrid,
 				       GridCartesian         &FourDimGrid,
 				       RealD _M5,const ImplParams &p) :
-  Kernels(p),
-  _FiveDimGrid        (&FiveDimGrid),
-  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid        (&FourDimGrid),
-  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
-  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
-  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
-  M5(_M5),
-  Umu(_FourDimGrid),
-  UmuEven(_FourDimGrid),
-  UmuOdd (_FourDimGrid),
-  Lebesgue(_FourDimGrid),
-  LebesgueEvenOdd(_FourDimGrid)
 {
  int nsimd = Simd::Nsimd();

@@ -148,15 +171,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
  }

  {
-    GaugeField HUmu(_Umu._grid);
-    HUmu = _Umu*(-0.5);
-    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
-    UmuEven=Umu;// Really want a reference.
-    UmuOdd =Umu;
  }
 }  
-
-
+  */
+     
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@@ -376,8 +394,6 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag

 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
-template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
-template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
  
 }}

--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -125,12 +125,14 @@ namespace Grid {
 		      double _M5,const ImplParams &p= ImplParams());

      // Constructors
+      /*
      WilsonFermion5D(int simd, 
 		      GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      double _M5,const ImplParams &p= ImplParams());
+      */

      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -572,7 +572,4 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,

  FermOpTemplateInstantiate(WilsonKernels);

-template class WilsonKernels<DomainWallRedBlack5dImplF>;		
-template class WilsonKernels<DomainWallRedBlack5dImplD>;
-
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -90,7 +90,7 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 template<>
-void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@@ -110,10 +110,10 @@ template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl
 template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -867,16 +867,16 @@ template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(Stencil
 									 int ss,int sU,const FermionField &in, FermionField &out);


-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);

--- a/lib/qcd/action/fermion/WilsonTMFermion.h
+++ b/lib/qcd/action/fermion/WilsonTMFermion.h
@@ -28,7 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_WILSON_TM_FERMION_H
 #define  GRID_QCD_WILSON_TM_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/utils/SpaceTimeGrid.cc
+++ b/lib/qcd/utils/SpaceTimeGrid.cc
@@ -84,7 +84,7 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC

 GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
 {
-  int N4=FourDimGrid->_ndimension;
+  int N4    = FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();

  std::vector<int> latt5(1,Ls);
@@ -103,11 +103,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const Gr
 {
  int N4=FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();
-  int cbd=0;
+  int cbd=1;
  std::vector<int> latt5(1,Ls);
  std::vector<int> simd5(1,nsimd);
  std::vector<int>  mpi5(1,1);
-  std::vector<int>   cb5(1,1);
+  std::vector<int>   cb5(1,0);
    
  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
--- a/lib/serialisation/Serialisation.h
+++ b/lib/serialisation/Serialisation.h
@@ -29,18 +29,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_SERIALISATION_READER_H
 #define GRID_SERIALISATION_READER_H

-#include <serialisation/MacroMagic.h>
-#include <serialisation/BaseIO.h>
 #include <stdint.h>

+#include "MacroMagic.h"
+#include "BaseIO.h"
+#include "BinaryIO.h"
+#include "TextIO.h"
+#include "XmlIO.h"
 //////////////////////////////////////////
 // Todo:
 //////////////////////////////////////////
-#include <serialisation/BinaryIO.h>
-#include <serialisation/TextIO.h>
-//#include <serialisation/JsonIO.h>
-//#include <serialisation/YamlIO.h>
-#include <serialisation/XmlIO.h>
+//#include "JsonIO.h"
+//#include "YamlIO.h"

 //////////////////////////////////////////
 // Select the default serialiser use ifdef's
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@@ -38,7 +38,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <vector>
 #include <cassert>

-#include "pugixml/pugixml.h"
+#include <Grid/pugixml/pugixml.h>

 namespace Grid
 {
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -303,13 +303,14 @@ namespace Grid {
 	int dist = perm&0xF;
        y=rotate(b,dist);
 	return;
-      }
-      switch(perm){
-      case 3: permute3(y,b); break;
-      case 2: permute2(y,b); break;
-      case 1: permute1(y,b); break;
-      case 0: permute0(y,b); break;
-      default: assert(0);
+      } else { 
+	switch(perm){
+	case 3: permute3(y,b); break;
+	case 2: permute2(y,b); break;
+	case 1: permute1(y,b); break;
+	case 0: permute0(y,b); break;
+	default: assert(0);
+	}
      }
    }
    
@@ -336,6 +337,20 @@ namespace Grid {
    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
    return ret;
  }
+  template <class S, class V, IfNotComplex<S> =0> 
+  inline void rotate( Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
+  {
+    nrot = nrot % Grid_simd<S,V>::Nsimd();
+    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
+    ret.v = Optimization::Rotate::rotate(b.v,nrot);
+  }
+  template <class S, class V, IfComplex<S> =0> 
+    inline void rotate(Grid_simd<S,V> &ret,Grid_simd<S,V> b,int nrot)
+  {
+    nrot = nrot % Grid_simd<S,V>::Nsimd();
+    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
+    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
+  }

  ///////////////////////
  // Splat
@@ -347,6 +362,12 @@ namespace Grid {
    ret.v = binary<V>(a, b, VsplatSIMD());
  }    

+  template <class S, class V> 
+  inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
+    S* typepun =(S*) &src;
+    vsplat(ret,typepun[lane]);
+  }    
+
  // overload if complex
  template <class S,class V> inline void vsplat(Grid_simd<S,V> &ret, EnableIf<is_complex < S >, S> c) {
    vsplat(ret,real(c),imag(c));
@@ -354,7 +375,7 @@ namespace Grid {

  //if real fill with a, if complex fill with a in the real part (first function above)
  template <class S,class V>
-    inline void vsplat(Grid_simd<S,V> &ret,NotEnableIf<is_complex< S>,S> a){
+  inline void vsplat(Grid_simd<S,V> &ret,NotEnableIf<is_complex< S>,S> a){
    ret.v = unary<V>(a, VsplatSIMD());
  }    
  //////////////////////////
--- a/lib/simd/Intel512wilson.h
+++ b/lib/simd/Intel512wilson.h
@@ -90,8 +90,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12

-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
+#include "Intel512common.h"
+#include "Intel512avx.h"

 //////////////////////////////////////////////////////////////////
 // Macros used to build wilson kernel -- can rationalise and simplify
--- a/lib/tensors/Tensor_arith.h
+++ b/lib/tensors/Tensor_arith.h
@@ -28,11 +28,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_MATH_ARITH_H
 #define GRID_MATH_ARITH_H

-#include <tensors/Tensor_arith_add.h>
-#include <tensors/Tensor_arith_sub.h>
-#include <tensors/Tensor_arith_mac.h>
-#include <tensors/Tensor_arith_mul.h>
-#include <tensors/Tensor_arith_scalar.h>
+#include "Tensor_arith_add.h"
+#include "Tensor_arith_sub.h"
+#include "Tensor_arith_mac.h"
+#include "Tensor_arith_mul.h"
+#include "Tensor_arith_scalar.h"

 #endif

--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -84,6 +84,9 @@ public:
  friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
    vstream(out._internal,in._internal);
  }
+  friend strong_inline void vbroadcast(iScalar<vtype> &out,const iScalar<vtype> &in,int lane){
+    vbroadcast(out._internal,in._internal,lane);
+  }
  friend strong_inline void zeroit(iScalar<vtype> &that){
    zeroit(that._internal);
  }
@@ -93,6 +96,9 @@ public:
  friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
    permute(out._internal,in._internal,permutetype);
  }
+  friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
+    rotate(out._internal,in._internal,rot);
+  }

  // Unary negation
  friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
@@ -200,11 +206,21 @@ public:
      vstream(out._internal[i],in._internal[i]);
    }
  }
+  friend strong_inline void vbroadcast(iVector<vtype,N> &out,const iVector<vtype,N> &in,int lane){
+    for(int i=0;i<N;i++){
+      vbroadcast(out._internal[i],in._internal[i],lane);
+    }
+  }
  friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
      permute(out._internal[i],in._internal[i],permutetype);
    }
  }
+  friend strong_inline void rotate(iVector<vtype,N> &out,const iVector<vtype,N> &in,int rot){
+    for(int i=0;i<N;i++){
+      rotate(out._internal[i],in._internal[i],rot);
+    }
+  }

  // Unary negation
  friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
@@ -318,7 +334,13 @@ public:
      for(int j=0;j<N;j++){
 	vstream(out._internal[i][j],in._internal[i][j]);
      }}
-    }
+  }
+  friend strong_inline void vbroadcast(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int lane){
+      for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	vbroadcast(out._internal[i][j],in._internal[i][j],lane);
+      }}
+  }

  friend strong_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
@@ -326,6 +348,12 @@ public:
 	permute(out._internal[i][j],in._internal[i][j],permutetype);
    }}
  }
+  friend strong_inline void rotate(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int rot){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	rotate(out._internal[i][j],in._internal[i][j],rot);
+    }}
+  }


  // Unary negation
--- a/prerequisites/Makefile.am
+++ b/prerequisites/Makefile.am
@@ -0,0 +1,38 @@
+FFTFLAGS=$(filter-out -std=c++11, $(CXXFLAGS) )
+
+EIGENVER=3.2.8
+EIGEN=eigen$(EIGENVER)
+EIGENTAR=$(EIGEN).tar.bz2
+EIGENURL=https://bitbucket.org/eigen/eigen/get/$(EIGENVER).tar.bz2
+
+FFTWVER=3.3.4
+FFTW=fftw-$(FFTWVER)
+FFTWTAR=fftw-$(FFTWVER).tar.gz
+FFTWURL=http://www.fftw.org/$(FFTWTAR)
+
+all: Eigen FFTW headerlist
+
+$(top_srcdir)/prerequisites/$(EIGENTAR): 
+	curl -v $(EIGENURL) -o $(top_srcdir)/prerequisites/$(EIGENTAR)
+
+$(top_srcdir)/prerequisites/$(FFTWTAR): 
+	curl -v $(FFTWURL) -o $(top_srcdir)/prerequisites/$(FFTWTAR)
+
+Eigen:  $(top_srcdir)/prerequisites/$(EIGENTAR)
+	tar xvf $(top_srcdir)/prerequisites/$(EIGENTAR)
+	- rm -rf  $(top_srcdir)/lib/Eigen
+	mv eigen-eigen*/Eigen .
+	echo EFILES=`find Eigen -type f -name '*.h' ` > $(top_srcdir)/lib/Eigen.inc
+	mv Eigen $(top_srcdir)/lib/
+	touch Eigen
+
+FFTW: $(top_srcdir)/prerequisites/$(FFTWTAR)
+	tar xvf $(top_srcdir)/prerequisites/$(FFTWTAR)
+	cd $(FFTW) &&	./configure --prefix=@abs_top_builddir@/prerequisites/fftwinstall CFLAGS="$(FFTFLAGS)" CC=$(CC) LDFLAGS="$(LDFLAGS)" && make all install
+	cp -pr fftwinstall/include/fftw3.h ../include/Grid/
+	cp -pr fftwinstall/lib/libfftw3.a  ../lib/
+	touch FFTW
+
+headerlist:
+	cd $(top_srcdir) && ./scripts/filelist
+	touch headerlist
--- a/prerequisites/eigen3.2.8.tar.bz2
+++ b/prerequisites/eigen3.2.8.tar.bz2
--- a/prerequisites/eigenIGENVER.tar.bz2
+++ b/prerequisites/eigenIGENVER.tar.bz2
--- a/scripts/filelist
+++ b/scripts/filelist
@@ -1,7 +1,8 @@
 #!/bin/bash
- 
-cd lib

+home=`pwd`
+ 
+cd $home/lib
 HFILES=`find . -type f -name '*.h'`
 CCFILES=`find . -type f -name '*.cc' -not  -name '*ommunicator*.cc'`
 echo> Make.inc
@@ -9,18 +10,24 @@ echo HFILES=$HFILES >> Make.inc
 echo >> Make.inc
 echo CCFILES=$CCFILES >> Make.inc

-cd ..
+cd $home/tests

-cd tests
+dirs=`find . -type d `
+
+for subdir in $dirs
+do
+
+pwd
+echo subdir is $subdir of $dirs
+
+cd $home/tests/$subdir

-echo> Make.inc
 TESTS=`ls T*.cc`
 TESTLIST=`echo ${TESTS} | sed s/.cc//g `

 echo > Make.inc
 echo bin_PROGRAMS += ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
 echo >> Make.inc
-
 for f in $TESTS
 do
 BNAME=`basename $f .cc`
@@ -30,31 +37,10 @@ echo ${BNAME}_LDADD=-lGrid>> Make.inc
 echo >> Make.inc
 done

-cd qdpxx

-echo> Make.inc
-TESTS=`ls T*.cc`
-TESTLIST=`echo ${TESTS} | sed s/.cc//g `
-
-echo > Make.inc
-echo bin_PROGRAMS = ${TESTLIST} >> Make.inc
-echo >> Make.inc
-
-for f in $TESTS
-do
-BNAME=`basename $f .cc`
-echo >> Make.inc
-echo ${BNAME}_SOURCES=$f  >> Make.inc
-echo ${BNAME}_LDADD=-lGrid>> Make.inc
-echo >> Make.inc
 done

-cd ..
-cd ..
-
-
-cd benchmarks
-
+cd $home/benchmarks

 echo> Make.inc
 TESTS=`ls B*.cc`
--- a/scripts/linecount
+++ b/scripts/linecount
@@ -19,7 +19,7 @@ count=`grep total tmp.fil`
 echo $count " in lib/${sdir}" 
 done

-wc -l tests/*.cc | grep total >& tmp.fil
+wc -l tests/*.cc tests/*/*.cc | grep total >& tmp.fil
 count=`grep total tmp.fil`
 echo  $count " in tests"

--- a/tests/IO/Make.inc
+++ b/tests/IO/Make.inc
@@ -0,0 +1,11 @@
+
+bin_PROGRAMS += Test_nersc_io Test_serialisation
+
+
+Test_nersc_io_SOURCES=Test_nersc_io.cc
+Test_nersc_io_LDADD=-lGrid
+
+
+Test_serialisation_SOURCES=Test_serialisation.cc
+Test_serialisation_LDADD=-lGrid
+
--- a/tests/IO/Makefile.am
+++ b/tests/IO/Makefile.am
@@ -0,0 +1,19 @@
+# additional include paths necessary to compile the C++ library
+
+bin_PROGRAMS =
+SUBDIRS =
+
+AM_CXXFLAGS = -I$(top_srcdir)/include
+AM_LDFLAGS = -L$(top_builddir)/lib
+
+if USE_LAPACK
+AM_CXXFLAGS += -DUSE_LAPACK
+if USE_LAPACK_LIB
+#if test "X${ac_LAPACK}X" != XyesX 
+AM_CXXFLAGS += -I$(ac_LAPACK)/include
+AM_LDFLAGS += -L$(ac_LAPACK)/lib
+#fi
+endif
+endif
+
+include Make.inc
--- a/tests/IO/Test_nersc_io.cc
+++ b/tests/IO/Test_nersc_io.cc
@@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -26,7 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {
  
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,239 +1,11 @@

-bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
-
-
-Test_GaugeAction_SOURCES=Test_GaugeAction.cc
-Test_GaugeAction_LDADD=-lGrid
-
-
-Test_RectPlaq_SOURCES=Test_RectPlaq.cc
-Test_RectPlaq_LDADD=-lGrid
-
-
-Test_cayley_cg_SOURCES=Test_cayley_cg.cc
-Test_cayley_cg_LDADD=-lGrid
-
-
-Test_cayley_coarsen_support_SOURCES=Test_cayley_coarsen_support.cc
-Test_cayley_coarsen_support_LDADD=-lGrid
-
-
-Test_cayley_even_odd_SOURCES=Test_cayley_even_odd.cc
-Test_cayley_even_odd_LDADD=-lGrid
-
-
-Test_cayley_ldop_cr_SOURCES=Test_cayley_ldop_cr.cc
-Test_cayley_ldop_cr_LDADD=-lGrid
-
-
-Test_cf_coarsen_support_SOURCES=Test_cf_coarsen_support.cc
-Test_cf_coarsen_support_LDADD=-lGrid
-
-
-Test_cf_cr_unprec_SOURCES=Test_cf_cr_unprec.cc
-Test_cf_cr_unprec_LDADD=-lGrid
-
-
-Test_cheby_SOURCES=Test_cheby.cc
-Test_cheby_LDADD=-lGrid
-
-
-Test_contfrac_cg_SOURCES=Test_contfrac_cg.cc
-Test_contfrac_cg_LDADD=-lGrid
-
-
-Test_contfrac_even_odd_SOURCES=Test_contfrac_even_odd.cc
-Test_contfrac_even_odd_LDADD=-lGrid
-
-
-Test_contfrac_force_SOURCES=Test_contfrac_force.cc
-Test_contfrac_force_LDADD=-lGrid
+bin_PROGRAMS += Test_cshift Test_simd Test_stencil


 Test_cshift_SOURCES=Test_cshift.cc
 Test_cshift_LDADD=-lGrid


-Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
-Test_cshift_red_black_LDADD=-lGrid
-
-
-Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
-Test_cshift_red_black_rotate_LDADD=-lGrid
-
-
-Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
-Test_cshift_rotate_LDADD=-lGrid
-
-
-Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
-Test_dwf_cg_prec_LDADD=-lGrid
-
-
-Test_dwf_cg_schur_SOURCES=Test_dwf_cg_schur.cc
-Test_dwf_cg_schur_LDADD=-lGrid
-
-
-Test_dwf_cg_unprec_SOURCES=Test_dwf_cg_unprec.cc
-Test_dwf_cg_unprec_LDADD=-lGrid
-
-
-Test_dwf_cr_unprec_SOURCES=Test_dwf_cr_unprec.cc
-Test_dwf_cr_unprec_LDADD=-lGrid
-
-
-Test_dwf_even_odd_SOURCES=Test_dwf_even_odd.cc
-Test_dwf_even_odd_LDADD=-lGrid
-
-
-Test_dwf_force_SOURCES=Test_dwf_force.cc
-Test_dwf_force_LDADD=-lGrid
-
-
-Test_dwf_fpgcr_SOURCES=Test_dwf_fpgcr.cc
-Test_dwf_fpgcr_LDADD=-lGrid
-
-
-Test_dwf_gpforce_SOURCES=Test_dwf_gpforce.cc
-Test_dwf_gpforce_LDADD=-lGrid
-
-
-Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
-Test_dwf_hdcr_LDADD=-lGrid
-
-
-Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
-Test_dwf_lanczos_LDADD=-lGrid
-
-
-Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
-Test_dwf_rb5d_LDADD=-lGrid
-
-
-Test_gamma_SOURCES=Test_gamma.cc
-Test_gamma_LDADD=-lGrid
-
-
-Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
-Test_gp_rect_force_LDADD=-lGrid
-
-
-Test_gparity_SOURCES=Test_gparity.cc
-Test_gparity_LDADD=-lGrid
-
-
-Test_gpdwf_force_SOURCES=Test_gpdwf_force.cc
-Test_gpdwf_force_LDADD=-lGrid
-
-
-Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
-Test_gpwilson_even_odd_LDADD=-lGrid
-
-
-Test_hmc_EODWFRatio_SOURCES=Test_hmc_EODWFRatio.cc
-Test_hmc_EODWFRatio_LDADD=-lGrid
-
-
-Test_hmc_EODWFRatio_Gparity_SOURCES=Test_hmc_EODWFRatio_Gparity.cc
-Test_hmc_EODWFRatio_Gparity_LDADD=-lGrid
-
-
-Test_hmc_EOWilsonFermionGauge_SOURCES=Test_hmc_EOWilsonFermionGauge.cc
-Test_hmc_EOWilsonFermionGauge_LDADD=-lGrid
-
-
-Test_hmc_EOWilsonRatio_SOURCES=Test_hmc_EOWilsonRatio.cc
-Test_hmc_EOWilsonRatio_LDADD=-lGrid
-
-
-Test_hmc_GparityIwasakiGauge_SOURCES=Test_hmc_GparityIwasakiGauge.cc
-Test_hmc_GparityIwasakiGauge_LDADD=-lGrid
-
-
-Test_hmc_GparityWilsonGauge_SOURCES=Test_hmc_GparityWilsonGauge.cc
-Test_hmc_GparityWilsonGauge_LDADD=-lGrid
-
-
-Test_hmc_IwasakiGauge_SOURCES=Test_hmc_IwasakiGauge.cc
-Test_hmc_IwasakiGauge_LDADD=-lGrid
-
-
-Test_hmc_RectGauge_SOURCES=Test_hmc_RectGauge.cc
-Test_hmc_RectGauge_LDADD=-lGrid
-
-
-Test_hmc_WilsonFermionGauge_SOURCES=Test_hmc_WilsonFermionGauge.cc
-Test_hmc_WilsonFermionGauge_LDADD=-lGrid
-
-
-Test_hmc_WilsonGauge_SOURCES=Test_hmc_WilsonGauge.cc
-Test_hmc_WilsonGauge_LDADD=-lGrid
-
-
-Test_hmc_WilsonRatio_SOURCES=Test_hmc_WilsonRatio.cc
-Test_hmc_WilsonRatio_LDADD=-lGrid
-
-
-Test_lie_generators_SOURCES=Test_lie_generators.cc
-Test_lie_generators_LDADD=-lGrid
-
-
-Test_main_SOURCES=Test_main.cc
-Test_main_LDADD=-lGrid
-
-
-Test_multishift_sqrt_SOURCES=Test_multishift_sqrt.cc
-Test_multishift_sqrt_LDADD=-lGrid
-
-
-Test_nersc_io_SOURCES=Test_nersc_io.cc
-Test_nersc_io_LDADD=-lGrid
-
-
-Test_partfrac_force_SOURCES=Test_partfrac_force.cc
-Test_partfrac_force_LDADD=-lGrid
-
-
-Test_quenched_update_SOURCES=Test_quenched_update.cc
-Test_quenched_update_LDADD=-lGrid
-
-
-Test_rect_force_SOURCES=Test_rect_force.cc
-Test_rect_force_LDADD=-lGrid
-
-
-Test_remez_SOURCES=Test_remez.cc
-Test_remez_LDADD=-lGrid
-
-
-Test_rhmc_EOWilson1p1_SOURCES=Test_rhmc_EOWilson1p1.cc
-Test_rhmc_EOWilson1p1_LDADD=-lGrid
-
-
-Test_rhmc_EOWilsonRatio_SOURCES=Test_rhmc_EOWilsonRatio.cc
-Test_rhmc_EOWilsonRatio_LDADD=-lGrid
-
-
-Test_rhmc_Wilson1p1_SOURCES=Test_rhmc_Wilson1p1.cc
-Test_rhmc_Wilson1p1_LDADD=-lGrid
-
-
-Test_rhmc_WilsonRatio_SOURCES=Test_rhmc_WilsonRatio.cc
-Test_rhmc_WilsonRatio_LDADD=-lGrid
-
-
-Test_rng_SOURCES=Test_rng.cc
-Test_rng_LDADD=-lGrid
-
-
-Test_rng_fixed_SOURCES=Test_rng_fixed.cc
-Test_rng_fixed_LDADD=-lGrid
-
-
-Test_serialisation_SOURCES=Test_serialisation.cc
-Test_serialisation_LDADD=-lGrid
-
-
 Test_simd_SOURCES=Test_simd.cc
 Test_simd_LDADD=-lGrid

@@ -241,47 +13,3 @@ Test_simd_LDADD=-lGrid
 Test_stencil_SOURCES=Test_stencil.cc
 Test_stencil_LDADD=-lGrid

-
-Test_synthetic_lanczos_SOURCES=Test_synthetic_lanczos.cc
-Test_synthetic_lanczos_LDADD=-lGrid
-
-
-Test_wilson_cg_prec_SOURCES=Test_wilson_cg_prec.cc
-Test_wilson_cg_prec_LDADD=-lGrid
-
-
-Test_wilson_cg_schur_SOURCES=Test_wilson_cg_schur.cc
-Test_wilson_cg_schur_LDADD=-lGrid
-
-
-Test_wilson_cg_unprec_SOURCES=Test_wilson_cg_unprec.cc
-Test_wilson_cg_unprec_LDADD=-lGrid
-
-
-Test_wilson_cr_unprec_SOURCES=Test_wilson_cr_unprec.cc
-Test_wilson_cr_unprec_LDADD=-lGrid
-
-
-Test_wilson_even_odd_SOURCES=Test_wilson_even_odd.cc
-Test_wilson_even_odd_LDADD=-lGrid
-
-
-Test_wilson_force_SOURCES=Test_wilson_force.cc
-Test_wilson_force_LDADD=-lGrid
-
-
-Test_wilson_force_phiMdagMphi_SOURCES=Test_wilson_force_phiMdagMphi.cc
-Test_wilson_force_phiMdagMphi_LDADD=-lGrid
-
-
-Test_wilson_force_phiMphi_SOURCES=Test_wilson_force_phiMphi.cc
-Test_wilson_force_phiMphi_LDADD=-lGrid
-
-
-Test_wilson_tm_even_odd_SOURCES=Test_wilson_tm_even_odd.cc
-Test_wilson_tm_even_odd_LDADD=-lGrid
-
-
-Test_zmm_SOURCES=Test_zmm.cc
-Test_zmm_LDADD=-lGrid
-
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -1,11 +1,16 @@
 # additional include paths necessary to compile the C++ library

-SUBDIRS = 
+#SUBDIRS = core
+
+# Uncomment to enable complete test suite build
+SUBDIRS = core forces hmc solver debug	
+
 if BUILD_CHROMA_REGRESSION
  SUBDIRS+= qdpxx
 endif
+bin_PROGRAMS =

-AM_CXXFLAGS = -I$(top_srcdir)/lib
+AM_CXXFLAGS = -I$(top_srcdir)/include
 AM_LDFLAGS = -L$(top_builddir)/lib

 if USE_LAPACK
@@ -18,10 +23,4 @@ AM_LDFLAGS += -L$(ac_LAPACK)/lib
 endif
 endif

-if BUILD_ZMM
-  bin_PROGRAMS=Test_zmm
-else
-  bin_PROGRAMS=
-endif
-
 include Make.inc
--- a/tests/Test_cshift.cc
+++ b/tests/Test_cshift.cc
@@ -26,7 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace Grid;
 using namespace Grid::QCD;
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -26,7 +26,7 @@ Author: neo <cossu@post.kek.jp>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include "Grid.h"
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/tests/core/Make.inc
+++ b/tests/core/Make.inc
@@ -0,0 +1,83 @@
+
+bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cf_coarsen_support Test_checker Test_contfrac_even_odd Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_even_odd Test_dwf_rb5d Test_gamma Test_gparity Test_gpwilson_even_odd Test_lie_generators Test_main Test_quenched_update Test_rng Test_rng_fixed Test_wilson_even_odd Test_wilson_tm_even_odd
+
+
+Test_GaugeAction_SOURCES=Test_GaugeAction.cc
+Test_GaugeAction_LDADD=-lGrid
+
+
+Test_RectPlaq_SOURCES=Test_RectPlaq.cc
+Test_RectPlaq_LDADD=-lGrid
+
+
+Test_cf_coarsen_support_SOURCES=Test_cf_coarsen_support.cc
+Test_cf_coarsen_support_LDADD=-lGrid
+
+
+Test_checker_SOURCES=Test_checker.cc
+Test_checker_LDADD=-lGrid
+
+
+Test_contfrac_even_odd_SOURCES=Test_contfrac_even_odd.cc
+Test_contfrac_even_odd_LDADD=-lGrid
+
+
+Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
+Test_cshift_red_black_LDADD=-lGrid
+
+
+Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
+Test_cshift_red_black_rotate_LDADD=-lGrid
+
+
+Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
+Test_cshift_rotate_LDADD=-lGrid
+
+
+Test_dwf_even_odd_SOURCES=Test_dwf_even_odd.cc
+Test_dwf_even_odd_LDADD=-lGrid
+
+
+Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
+Test_dwf_rb5d_LDADD=-lGrid
+
+
+Test_gamma_SOURCES=Test_gamma.cc
+Test_gamma_LDADD=-lGrid
+
+
+Test_gparity_SOURCES=Test_gparity.cc
+Test_gparity_LDADD=-lGrid
+
+
+Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
+Test_gpwilson_even_odd_LDADD=-lGrid
+
+
+Test_lie_generators_SOURCES=Test_lie_generators.cc
+Test_lie_generators_LDADD=-lGrid
+
+
+Test_main_SOURCES=Test_main.cc
+Test_main_LDADD=-lGrid
+
+
+Test_quenched_update_SOURCES=Test_quenched_update.cc
+Test_quenched_update_LDADD=-lGrid
+
+
+Test_rng_SOURCES=Test_rng.cc
+Test_rng_LDADD=-lGrid
+
+
+Test_rng_fixed_SOURCES=Test_rng_fixed.cc
+Test_rng_fixed_LDADD=-lGrid
+
+
+Test_wilson_even_odd_SOURCES=Test_wilson_even_odd.cc
+Test_wilson_even_odd_LDADD=-lGrid
+
+
+Test_wilson_tm_even_odd_SOURCES=Test_wilson_tm_even_odd.cc
+Test_wilson_tm_even_odd_LDADD=-lGrid
+
--- a/tests/core/Makefile.am
+++ b/tests/core/Makefile.am
@@ -0,0 +1,19 @@
+# additional include paths necessary to compile the C++ library
+
+bin_PROGRAMS =
+SUBDIRS =
+
+AM_CXXFLAGS = -I$(top_srcdir)/include
+AM_LDFLAGS = -L$(top_builddir)/lib
+
+if USE_LAPACK
+AM_CXXFLAGS += -DUSE_LAPACK
+if USE_LAPACK_LIB
+#if test "X${ac_LAPACK}X" != XyesX 
+AM_CXXFLAGS += -I$(ac_LAPACK)/include
+AM_LDFLAGS += -L$(ac_LAPACK)/lib
+#fi
+endif
+endif
+
+include Make.inc
--- a/tests/core/Test_GaugeAction.cc
+++ b/tests/core/Test_GaugeAction.cc
@@ -27,10 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-
-#include <qcd/utils/CovariantCshift.h>
-#include <qcd/utils/WilsonLoops.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_RectPlaq.cc
+++ b/tests/core/Test_RectPlaq.cc
@@ -26,10 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-
-#include <qcd/utils/CovariantCshift.h>
-#include <qcd/utils/WilsonLoops.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_cf_coarsen_support.cc
+++ b/tests/core/Test_cf_coarsen_support.cc
@@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_checker.cc
+++ b/tests/core/Test_checker.cc
@@ -0,0 +1,155 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_cg_prec.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int toint(const char* str){
+  std::stringstream os; os << str;
+  int out; os >> out;
+  return out;
+}
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  assert(argc >= 5);
+  
+  std::vector<int> latt(4,0);
+  latt[0] = toint(argv[1]);
+  latt[1] = toint(argv[2]);
+  latt[2] = toint(argv[3]);
+  latt[3] = toint(argv[4]);
+  
+  const int Ls= toint(argv[5]);
+  
+  std::cout << "Lattice size (" << latt[0] << "," << latt[1] << "," << latt[2] << "," << latt[3] << ") Ls=" << Ls << std::endl;
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  std::cout << "SIMD layout (" << simd_layout[0] << "," << simd_layout[1] << "," << simd_layout[2] << "," << simd_layout[3] << ")" << std::endl;
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt, simd_layout,GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+
+  //typedef Lattice<iGparitySpinColourVector<vComplexD> > LatticeType;
+  typedef LatticeFermionD LatticeType;
+  
+  LatticeType    src(FGrid); random(RNG5,src);
+
+  LatticeType src_o(FrbGrid);
+  pickCheckerboard(Odd,src_o,src);
+
+  std::vector<int> site(5);
+  std::vector<int> cbsite(5);
+  typedef typename GridTypeMapper<LatticeType::vector_object>::scalar_object sobj;
+
+  // std::cout << "sizeof(vobj) " << sizeof(LatticeType::vector_object) << std::endl;
+  // std::cout << "sizeof(sobj) " << sizeof(sobj) << std::endl;
+  std::cout << "v1 from uncheckerboarded field, v2 from odd-parity red-black field\n";
+
+  for(site[0]=0;site[0]<Ls;site[0]++){
+    for(site[4]=0;site[4]<latt[3];site[4]++){
+      for(site[3]=0;site[3]<latt[2];site[3]++){
+	for(site[2]=0;site[2]<latt[1];site[2]++){
+	  for(site[1]=0;site[1]<latt[0];site[1]++){
+	    if(src_o._grid->CheckerBoard(site) != src_o.checkerboard)
+	      continue;
+
+	    std::cout << "Site (" << site[0] << "," << site[1] << "," << site[2] << "," << site[3] << "," << site[4] << ")" << std::endl;
+	    sobj v1, v2;
+	    peekLocalSite(v1,src,site);
+	    peekLocalSite(v2,src_o,site);
+	    
+	    RealD v1_norm = norm2(v1);
+	    RealD v2_norm = norm2(v2);
+	    RealD diff = v2_norm - v1_norm;
+
+	    std::cout << v1_norm << " " << v2_norm << " " << diff << '\n';
+	    if(fabs(diff) > 1e-12){
+	      std::cout << "ERROR!\n";
+	      exit(-1);
+	    }
+	  }
+	}
+      }
+    }
+  }
+
+  
+
+
+
+
+
+  
+  // LatticeFermion result(FGrid); result=zero;
+  // LatticeGaugeField Umu(UGrid); 
+
+  // SU3::HotConfiguration(RNG4,Umu);
+
+  // std::vector<LatticeColourMatrix> U(4,UGrid);
+  // for(int mu=0;mu<Nd;mu++){
+  //   U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+  // }
+  
+  // RealD mass=0.1;
+  // RealD M5=1.8;
+  // DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  // LatticeFermion    src_o(FrbGrid);
+  // LatticeFermion result_o(FrbGrid);
+  // pickCheckerboard(Odd,src_o,src);
+  // result_o=zero;
+
+  // SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
+  // ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  // CG(HermOpEO,src_o,result_o);
+
+  Grid_finalize();
+}
--- a/tests/core/Test_contfrac_even_odd.cc
+++ b/tests/core/Test_contfrac_even_odd.cc
@@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_cshift_red_black.cc
+++ b/tests/core/Test_cshift_red_black.cc
@@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace Grid;
 using namespace Grid::QCD;
--- a/tests/core/Test_cshift_red_black_rotate.cc
+++ b/tests/core/Test_cshift_red_black_rotate.cc
@@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace Grid;
 using namespace Grid::QCD;
--- a/tests/core/Test_cshift_rotate.cc
+++ b/tests/core/Test_cshift_rotate.cc
@@ -26,7 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace Grid;
 using namespace Grid::QCD;
--- a/tests/core/Test_dwf_even_odd.cc
+++ b/tests/core/Test_dwf_even_odd.cc
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/tests/core/Test_dwf_rb5d.cc
+++ b/tests/core/Test_dwf_rb5d.cc
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
@@ -44,9 +44,9 @@ struct scal {
    Gamma::GammaT
  };

-typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;

 typedef WilsonFermion5D<WilsonImplR> WilsonFermion5D_OKR;

@@ -97,14 +97,12 @@ int main (int argc, char ** argv)
  RealD M5  =1.8;
  typename WilsonFermion5DR::ImplParams params;

-  WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,M5,params);
+  WilsonFermion5DR Dw(Umu,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,M5,params);

  Dw.Dhop(src,result,0);

  std::cout << "Norm src = "<<norm2(src)<<" Norm res = "<<norm2(result) << std::endl;

-
-
  GridCartesian         * FokGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FokrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  WilsonFermion5D_OKR Dok(Umu,*FokGrid,*FokrbGrid,*UGrid,*UrbGrid,M5,params);
@@ -131,7 +129,7 @@ int main (int argc, char ** argv)
  
  std::cout << "Reference = "<<norm2(src_ok)<<" res = "<<norm2(ref_ok) << std::endl;
  ref_ok = ref_ok - result_ok;
-  std::cout << "Reference diff = "<<norm2(result_ok)<< std::endl;
+  std::cout << "Result = "<<norm2(result_ok)<< std::endl;
  std::cout << "Reference diff = "<<norm2(ref_ok)<< std::endl;

  Grid_finalize();
--- a/Show More
+++ b/Show More