Merge branch 'feature/new-build' into feature/hadrons

# Conflicts: # Makefile.am # scripts/copyright
2025-07-10 02:17:06 +01:00 · 2016-08-03 16:49:16 +01:00
parent e0b7004f96 3b376ed54e
commit 2485ef9c9c
215 changed files with 8454 additions and 4613 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,7 +5,6 @@
 *.o
 *.obj

-
 # Editor files #
 ################
 *~
@ -48,6 +47,7 @@ Config.h.in
 config.log
 config.status
 .deps
+*.inc

 # http://www.gnu.org/software/autoconf #
 ########################################
@ -63,19 +63,7 @@ config.sub
 config.guess
 INSTALL
 .dirstamp
-
-# Packages #
-############
-# it's better to unpack these files and commit the raw source
-# git has its own built in compression methods
-*.7z
-*.dmg
-*.gz
-*.iso
-*.jar
-*.rar
-*.tar
-*.zip
+ltmain.sh
 
 # Logs and databases #
 ######################
@ -101,3 +89,12 @@ build*/*
 #####################
 *.xcodeproj/*
 build.sh
+
+# Eigen source #
+################
+lib/Eigen/*
+
+# libtool macros #
+##################
+m4/lt*
+m4/libtool.m4
--- a/.travis.yml
+++ b/.travis.yml
@ -82,9 +82,13 @@ install:
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
    
 script:
-    - ./scripts/reconfigure_script
+    - ./bootstrap.sh
    - mkdir build
    - cd build
-    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
+    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
+    - make -j4 
+    - ./benchmarks/Benchmark_dwf --threads 1
+    - echo make clean
+    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1
--- a/Makefile.am
+++ b/Makefile.am
@ -1,5 +1,4 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/
-SUBDIRS = lib tests benchmarks programs
+SUBDIRS = lib benchmarks tests programs

-filelist: $(SUBDIRS)
+ACLOCAL_AMFLAGS = -I m4
--- a/README.md
+++ b/README.md
@ -1,8 +1,28 @@
-# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid)
-Data parallel C++ mathematical object library
+# Grid
+<table>
+<tr>
+    <td>Last stable release</td>
+    <td><a href="https://travis-ci.org/paboyle/Grid">
+    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
+    </td>
+</tr>
+<tr>
+    <td>Development branch</td>
+    <td><a href="https://travis-ci.org/paboyle/Grid">
+    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
+    </td>
+</tr>
+</table>

-Last update 2015/7/30
+**Data parallel C++ mathematical object library.**

+Please send all pull requests to the `develop` branch.
+
+License: GPL v2.
+
+Last update 2016/08/03.
+
+### Description
 This library provides data parallel C++ container classes with internal memory layout
 that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 are provided, similar to HPF and cmfortran, and user control is given over the mapping of
@ -22,37 +42,67 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
 for most programmers.

 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON on the way).
+Presently SSE4 (128 bit), AVX and AVX2 (256 bit), and IMCI and AVX512 (512 bit) targets are supported (ARM NEON and BG/Q QPX on the way).

-These are presented as 
-
-     vRealF, vRealD, vComplexF, vComplexD 
-
-internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named
-
-     RealF, RealD, ComplexF, ComplexD
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.

 MPI, OpenMP, and SIMD parallelism are present in the library.
+Please see https://arxiv.org/abs/1512.03487 for more detail.

-   You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment.  Here
-are examples:
+### Installation
+First, start by cloning the repository:

-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
+``` bash
+git clone https://github.com/paboyle/Grid.git
+```

-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX
+Then enter the cloned directory and set up the build system:

-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
+``` bash
+cd Grid
+./bootstrap.sh
+```

-     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
-     
-Note: Before running configure it could be necessary to execute the script 
-       
-       script/filelist
+Now you can execute the `configure` script to generate makefiles (here from a build directory):

+``` bash
+mkdir build; cd build
+../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi --prefix=<path>
+```

-     
-For developers:
-Use reconfigure_script in the scripts/ directory to create the autotools environment 
+where `--enable-precision=` sets the default precision (`single` or `double`), `--enable-simd=` sets the SIMD type (see possible values below), `--enable-comms=` sets the protocol used for communications (`none`, `mpi` or `shmem`), and `<path>` should be replaced by the prefix path where you want to install Grid. Other options are available; use `configure --help` to display them. As with any other program using GNU Autotools, the `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to customise the build.

+Finally, you can build and install Grid:
+
+``` bash
+make; make install
+```
+
+To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
+
+``` bash
+make -C tests/<subdir> tests
+```
+
+### Possible SIMD types
+
+The following options can be used with the `--enable-simd=` option to target different SIMD instruction sets:
+
+| String      | Description                            |
+| ----------- | -------------------------------------- |
+| `GEN`       | generic portable vector code           |
+| `SSE4`      | SSE 4.2 (128 bit)                      |
+| `AVX`       | AVX (256 bit)                          |
+| `AVXFMA4`   | AVX (256 bit) + FMA                    |
+| `AVX2`      | AVX 2 (256 bit)                        |
+| `AVX512`    | AVX 512 bit                            |
+| `AVX512MIC` | AVX 512 bit for Intel MIC architecture |
+| `IMCI`      | Intel IMCI instructions (512 bit)      |
+
+Alternatively, some CPU codenames can be directly used:
+
+| String      | Description                            |
+| ----------- | -------------------------------------- |
+| `KNC`       | [Intel Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
+| `KNL`       | [Intel Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
@ -196,5 +196,126 @@ int main (int argc, char ** argv)



+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+
+
+  for(int lat=4;lat<=32;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
+
+      std::vector<int> latt_size  ({lat,lat,lat,lat});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+
+      std::vector<CartesianCommunicator::CommsRequest_t> empty;
+      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
+      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
+
+      ncomm=0; // reset once, before the loop: the throughput figures below scale by the number of communicating dimensions
+      for(int mu=0;mu<4;mu++){ // build the fwd/bwd requests for each dimension that is split across ranks
+	if (mpi_layout[mu]>1 ) {
+	  ncomm++;
+
+	  int comm_proc;
+	  int xmit_to_rank;
+	  int recv_from_rank;
+
+	  comm_proc=1; // shift of one rank in mu (presumably the forward neighbour -- confirm against ShiftedRanks)
+	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	  Grid.SendToRecvFromInit(requests_fwd[mu],
+				  (void *)&xbuf[mu][0],
+				  xmit_to_rank,
+				  (void *)&rbuf[mu][0],
+				  recv_from_rank,
+				  bytes);
+
+	  comm_proc = mpi_layout[mu]-1; // shift of N-1 ranks in mu (presumably the backward neighbour on a periodic layout)
+	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	  Grid.SendToRecvFromInit(requests_bwd[mu],
+				  (void *)&xbuf[mu+4][0], // buffers 4..7 are used for the second direction
+				  xmit_to_rank,
+				  (void *)&rbuf[mu+4][0],
+				  recv_from_rank,
+				  bytes);
+
+	}
+      }
+
+      {
+	double start=usecond();
+	for(int i=0;i<Nloop;i++){
+	  
+	  for(int mu=0;mu<4;mu++){
+	    
+	    if (mpi_layout[mu]>1 ) {
+	      
+	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
+	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
+	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
+	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
+	    }
+	  }
+	  Grid.Barrier();
+	}
+	
+	double stop=usecond();
+	
+	double dbytes    = bytes;
+	double xbytes    = Nloop*dbytes*2.0*ncomm;
+	double rbytes    = xbytes;
+	double bidibytes = xbytes+rbytes;
+	
+	double time = stop-start;
+	
+	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+
+      }
+
+
+      {
+	double start=usecond();
+	for(int i=0;i<Nloop;i++){
+	  
+	  for(int mu=0;mu<4;mu++){
+	    
+	    if (mpi_layout[mu]>1 ) {
+	      
+	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
+	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
+	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
+	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
+	    }
+	  }
+	  Grid.Barrier();
+	}
+	
+	double stop=usecond();
+	
+	double dbytes    = bytes;
+	double xbytes    = Nloop*dbytes*2.0*ncomm;
+	double rbytes    = xbytes;
+	double bidibytes = xbytes+rbytes;
+	
+	double time = stop-start;
+	
+	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+
+      }
+
+    }
+  }
+
+
+
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-#include <PerfCount.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
@ -46,9 +45,9 @@ struct scal {
  };

 bool overlapComms = false;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
-typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;


 int main (int argc, char ** argv)
@ -71,8 +70,8 @@ int main (int argc, char ** argv)

  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
@ -87,6 +86,16 @@ int main (int argc, char ** argv)
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);

+  /*  src=zero;
+  std::vector<int> origin(5,0);
+  SpinColourVector f=zero;
+  for(int sp=0;sp<4;sp++){
+  for(int co=0;co<3;co++){
+    f()(sp)(co)=Complex(1.0,0.0); 
+  }}
+  pokeSite(f,src,origin);
+  */
+
  ColourMatrix cm = Complex(1.0,0.0);

  LatticeGaugeField Umu(UGrid); 
@ -127,19 +136,16 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5  =1.8;

-  typename DomainWallFermionR::ImplParams params; 
-  params.overlapCommsCompute = overlapComms;
-  
  RealD NP = UGrid->_Nprocessors;

  for(int doasm=1;doasm<2;doasm++){

    QCD::WilsonKernelsStatic::AsmOpt=doasm;

-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall =10;
+  int ncall =100;
  if (1) {

    double t0=usecond();
@ -165,11 +171,12 @@ int main (int argc, char ** argv)

  if (1)
  {
-    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+    typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
    LatticeFermion ssrc(sFGrid);
    LatticeFermion sref(sFGrid);
    LatticeFermion sresult(sFGrid);
-    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
+
+    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
  
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
@ -181,7 +188,7 @@ int main (int argc, char ** argv)
      peekSite(tmp,src,site);
      pokeSite(tmp,ssrc,site);
    }}}}}
-
+    std::cout<<"src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
@ -208,6 +215,7 @@ int main (int argc, char ** argv)
      }
    }

+    std::cout<<"res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;


    RealF sum=0;
@ -221,9 +229,11 @@ int main (int argc, char ** argv)
      peekSite(normal,result,site);
      peekSite(simd,sresult,site);
      sum=sum+norm2(normal-simd);
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
-      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
+      if (norm2(normal-simd) > 1.0e-6 ) {
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd   "<<simd<<std::endl;
+      }
    }}}}}
    std::cout<<" difference between normal and simd is "<<sum<<std::endl;

@ -268,9 +278,9 @@ int main (int argc, char ** argv)
      pickCheckerboard(Even,ssrc_e,sresult);
      pickCheckerboard(Odd ,ssrc_o,sresult);
      ssrc_e = ssrc_e - sr_e;
-      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<<std::endl;
+      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
      ssrc_o = ssrc_o - sr_o;
-      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<<std::endl;
+      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
    }


--- a/benchmarks/Benchmark_dwf_ntpf
+++ b/benchmarks/Benchmark_dwf_ntpf
--- a/benchmarks/Benchmark_dwf_ntpf.cc
+++ b/benchmarks/Benchmark_dwf_ntpf.cc
@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-#include <PerfCount.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@ -1,4 +1,3 @@
-
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@ -27,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-#include <PerfCount.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
@ -127,7 +125,6 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )

  ColourMatrix cm = Complex(1.0,0.0);

-
  LatticeGaugeField Umu5d(FGrid); 

  // replicate across fifth dimension
@ -146,11 +143,10 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  }

 #ifdef CHECK
-  if (1)
-  {
+  if (1) {
+
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
-
      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

@ -194,20 +190,19 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
    Counter.Report();
  }
  
-  if ( ! report ) 
-    {
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=1344*volume*ncall;
-      std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
-    }
+  if ( ! report ) {
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
+  }
  
 #ifdef CHECK
-    err = ref-result; 
-    RealD errd = norm2(err);
-    if ( errd> 1.0e-4 ) {
-      std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
-      exit(-1);
-    }
+  err = ref-result; 
+  RealD errd = norm2(err);
+  if ( errd> 1.0e-4 ) {
+    std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
+    exit(-1);
+  }
 #endif
    
  LatticeFermion src_e (FrbGrid);
@ -233,10 +228,9 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
      std::cout<< flops/(t1-t0);
    }
  }
-  
 }

-#undef CHECK_SDW
+#define CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {

@ -244,7 +238,9 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

@ -278,93 +274,89 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
    }
  }

-
  RealD mass=0.1;
  RealD M5  =1.8;

-    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
-    LatticeFermionF ssrc(sFGrid);
-    LatticeFermionF sref(sFGrid);
-    LatticeFermionF sresult(sFGrid);
-    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
+  typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+  LatticeFermion ssrc(sFGrid);
+  LatticeFermion sref(sFGrid);
+  LatticeFermion sresult(sFGrid);
+  WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
  
-    for(int x=0;x<latt4[0];x++){
-    for(int y=0;y<latt4[1];y++){
-    for(int z=0;z<latt4[2];z++){
-    for(int t=0;t<latt4[3];t++){
-    for(int s=0;s<Ls;s++){
-      std::vector<int> site({s,x,y,z,t});
-      SpinColourVectorF tmp;
-      peekSite(tmp,src,site);
-      pokeSite(tmp,ssrc,site);
-    }}}}}
+  for(int x=0;x<latt4[0];x++){
+  for(int y=0;y<latt4[1];y++){
+  for(int z=0;z<latt4[2];z++){
+  for(int t=0;t<latt4[3];t++){
+  for(int s=0;s<Ls;s++){
+    std::vector<int> site({s,x,y,z,t});
+    SpinColourVector tmp;
+    peekSite(tmp,src,site);
+    pokeSite(tmp,ssrc,site);
+  }}}}}

-    double t0=usecond();
-    sDw.Dhop(ssrc,sresult,0);
-    double t1=usecond();
+  double t0=usecond();
+  sDw.Dhop(ssrc,sresult,0);
+  double t1=usecond();

 #ifdef TIMERS_OFF
-    int ncall =10;
+  int ncall =10;
 #else 
-    int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
 #endif

-    PerformanceCounter Counter(8);
-    Counter.Start();
-    t0=usecond();
-    for(int i=0;i<ncall;i++){
-      sDw.Dhop(ssrc,sresult,0);
-    }
-    t1=usecond();
-    Counter.Stop();
+  PerformanceCounter Counter(8);
+  Counter.Start();
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    sDw.Dhop(ssrc,sresult,0);
+  }
+  t1=usecond();
+  Counter.Stop();
+  
+  if ( report ) {
+    Counter.Report();
+  } else { 
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+    std::cout<<"\t"<< flops/(t1-t0);
+  }

-    if ( report ) {
-      Counter.Report();
-    } else { 
-
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=1344*volume*ncall;
-      std::cout<<"\t"<< flops/(t1-t0);
-    }
-
-
-    LatticeFermionF sr_eo(sFGrid);
-    LatticeFermionF serr(sFGrid);
-    
-    LatticeFermion ssrc_e (sFrbGrid);
-    LatticeFermion ssrc_o (sFrbGrid);
-    LatticeFermion sr_e   (sFrbGrid);
-    LatticeFermion sr_o   (sFrbGrid);
+  LatticeFermion sr_eo(sFGrid);
+  LatticeFermion serr(sFGrid);
+  
+  LatticeFermion ssrc_e (sFrbGrid);
+  LatticeFermion ssrc_o (sFrbGrid);
+  LatticeFermion sr_e   (sFrbGrid);
+  LatticeFermion sr_o   (sFrbGrid);
      
-    pickCheckerboard(Even,ssrc_e,ssrc);
-    pickCheckerboard(Odd,ssrc_o,ssrc);
-
-    setCheckerboard(sr_eo,ssrc_o);
-    setCheckerboard(sr_eo,ssrc_e);
-    
-    sr_e = zero;
-    sr_o = zero;
+  pickCheckerboard(Even,ssrc_e,ssrc);
+  pickCheckerboard(Odd,ssrc_o,ssrc);
+  
+  setCheckerboard(sr_eo,ssrc_o);
+  setCheckerboard(sr_eo,ssrc_e);
    
+  sr_e = zero;
+  sr_o = zero;
+  
+  sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+  PerformanceCounter CounterSdw(8);
+  CounterSdw.Start();
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    __SSC_START;
    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
-    PerformanceCounter CounterSdw(8);
-    CounterSdw.Start();
-    t0=usecond();
-    for(int i=0;i<ncall;i++){
-      __SSC_START;
-      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
-      __SSC_STOP;
-    }
-    t1=usecond();
-    CounterSdw.Stop();
+    __SSC_STOP;
+  }
+  t1=usecond();
+  CounterSdw.Stop();

-    if ( report ) { 
-      CounterSdw.Report();
-    } else {
-
-      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=(1344.0*volume*ncall)/2;
-      std::cout<<"\t"<< flops/(t1-t0);
-    }
+  if ( report ) { 
+    CounterSdw.Report();
+  } else {
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=(1344.0*volume*ncall)/2;
+    std::cout<<"\t"<< flops/(t1-t0);
+  }
 }


--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@ -26,7 +26,7 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_zmm
+++ b/benchmarks/Benchmark_zmm
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@ -25,8 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
-#include <PerfCount.h>
+#include <Grid/Grid.h>


 using namespace Grid;
--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@ -1,39 +0,0 @@
-
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
-
-
-Benchmark_comms_SOURCES=Benchmark_comms.cc
-Benchmark_comms_LDADD=-lGrid
-
-
-Benchmark_dwf_SOURCES=Benchmark_dwf.cc
-Benchmark_dwf_LDADD=-lGrid
-
-
-Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
-Benchmark_dwf_ntpf_LDADD=-lGrid
-
-
-Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
-Benchmark_dwf_sweep_LDADD=-lGrid
-
-
-Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
-Benchmark_memory_asynch_LDADD=-lGrid
-
-
-Benchmark_memory_bandwidth_SOURCES=Benchmark_memory_bandwidth.cc
-Benchmark_memory_bandwidth_LDADD=-lGrid
-
-
-Benchmark_su3_SOURCES=Benchmark_su3.cc
-Benchmark_su3_LDADD=-lGrid
-
-
-Benchmark_wilson_SOURCES=Benchmark_wilson.cc
-Benchmark_wilson_LDADD=-lGrid
-
-
-Benchmark_zmm_SOURCES=Benchmark_zmm.cc
-Benchmark_zmm_LDADD=-lGrid
-
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@ -1,8 +1 @@
-# additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/lib
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-#
-# Test code
-#
 include Make.inc
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -e  # stop at the first failing step so autoreconf never runs on a half-prepared tree
+./scripts/update_eigen.sh eigen-3.2.9.tar.bz2
+./scripts/filelist
+autoreconf -fvi
--- a/configure.ac
+++ b/configure.ac
@ -1,317 +1,308 @@
-#                         -*- Autoconf -*-
-# Process this file with autoconf to produce a configure script.
-#
-# Project Grid package  
-# 
-# Time-stamp: <2015-07-10 17:46:21 neo>
-
 AC_PREREQ([2.63])
-AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
-AC_CANONICAL_SYSTEM
+AC_INIT([Grid], [0.5.1-dev], [https://github.com/paboyle/Grid], [Grid])
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([lib/Grid.h])
 AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])

-AC_MSG_NOTICE([
-
-:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-Configuring $PACKAGE v$VERSION  for $host
-:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-])
-
-# Checks for programs.
+############### Checks for programs
 AC_LANG(C++)
+: ${CXXFLAGS="-O3"}
 AC_PROG_CXX
 AC_OPENMP
-AC_PROG_RANLIB
-#AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
-AX_EXT
+AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
+LT_INIT([disable-shared])

-# Checks for libraries.
-#AX_GCC_VAR_ATTRIBUTE(aligned)
-
-# Checks for header files.
+############### Checks for header files
 AC_CHECK_HEADERS(stdint.h)
 AC_CHECK_HEADERS(mm_malloc.h)
 AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
-AC_CHECK_HEADERS(gmp.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

-# Checks for typedefs, structures, and compiler characteristics.
+############### Checks for typedefs, structures, and compiler characteristics
 AC_TYPE_SIZE_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T

-# Checks for library functions.
-echo
-echo Checking libraries 
-echo :::::::::::::::::::::::::::::::::::::::::::
+############### Options
+AC_ARG_WITH([gmp],
+    [AS_HELP_STRING([--with-gmp=prefix],
+    [try this for a non-standard install prefix of the GMP library])],
+    [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
+    [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
+AC_ARG_WITH([mpfr],
+    [AS_HELP_STRING([--with-mpfr=prefix],
+    [try this for a non-standard install prefix of the MPFR library])],
+    [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
+    [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
+AC_ARG_ENABLE([lapack],
+    [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], 
+    [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+case ${ac_LAPACK} in
+    no)
+        ;;
+    yes)
+        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
+    *)
+        AM_CXXFLAGS="-I$ac_LAPACK/include $AM_CXXFLAGS"
+        AM_LDFLAGS="-L$ac_LAPACK/lib $AM_LDFLAGS"
+        AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
+esac

+################ Get compiler information
+AC_LANG([C++])
+AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
+AX_COMPILER_VENDOR
+AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
+      [vendor of C++ compiler that will compile the code])
+AX_GXX_VERSION
+AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
+      [version of g++ that will compile the code])
+
+############### Checks for library functions
+CXXFLAGS_CPY=$CXXFLAGS
+LDFLAGS_CPY=$LDFLAGS
+LIBS_CPY=$LIBS
+CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
+LDFLAGS="$AM_LDFLAGS $LDFLAGS"
 AC_CHECK_FUNCS([gettimeofday])
+AC_CHECK_LIB([gmp],[__gmpf_init],
+             [AC_CHECK_LIB([mpfr],[mpfr_init],
+                 [AC_DEFINE([HAVE_LIBMPFR], [1], [Define to 1 if you have the `MPFR' library (-lmpfr).])]
+                 [have_mpfr=true]
+                 [LIBS="$LIBS -lmpfr"],
+                 [AC_MSG_ERROR([MPFR library not found])], [-lgmp])]
+             [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
+             [have_gmp=true]
+             [LIBS="$LIBS -lgmp"])

-#AC_CHECK_LIB([gmp],[__gmpf_init],,
-#        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
-#Please install or provide the correct path to your installation
-#Info at: http://www.gmplib.org)])
+if test "${ac_LAPACK}x" != "nox"; then
+    AC_CHECK_LIB([lapack],[LAPACKE_sbdsdc],[],
+                 [AC_MSG_ERROR("LAPACK enabled but library not found")])
+fi
+CXXFLAGS=$CXXFLAGS_CPY
+LDFLAGS=$LDFLAGS_CPY

-#AC_CHECK_LIB([mpfr],[mpfr_init],,
-#        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
-#Please install or provide the correct path to your installation
-#Info at: http://www.mpfr.org/)])
-
-#
-# SIMD instructions selection
-#
-
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
+############### SIMD instruction selection
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|AVX512MIC|IMCI|KNL|KNC],\
 	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
-	[ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
+	[ac_SIMD=${enable_simd}],[ac_SIMD=GEN])

-supported=no
-
-ac_ZMM=no;
+case ${ax_cv_cxx_compiler_vendor} in
+  clang|gnu)
+    case ${ac_SIMD} in
+      SSE4)
+        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
+        SIMD_FLAGS='-msse4.2';;
+      AVX)
+        AC_DEFINE([AVX1],[1],[AVX intrinsics])
+        SIMD_FLAGS='-mavx';;
+      AVXFMA4)
+        AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
+        SIMD_FLAGS='-mavx -mfma4';;
+      AVX2)
+        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
+        SIMD_FLAGS='-mavx2';;
+      AVX512|AVX512MIC|KNL)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+      IMCI|KNC)
+        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+        SIMD_FLAGS='';;
+      GEN)
+        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
+        SIMD_FLAGS='';;
+      *)
+        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang"]);;
+    esac;;
+  intel)
+    case ${ac_SIMD} in
+      SSE4)
+        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
+        SIMD_FLAGS='-msse4.2 -xsse4.2';;
+      AVX)
+        AC_DEFINE([AVX1],[1],[AVX intrinsics])
+        SIMD_FLAGS='-mavx -xavx';;
+      AVXFMA4)
+        AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
+        SIMD_FLAGS='-mavx -xavx -mfma';;
+      AVX2)
+        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
+        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
+      AVX512)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+        SIMD_FLAGS='-xcore-avx512';;
+      AVX512MIC|KNL)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
+        SIMD_FLAGS='-xmic-avx512';;
+      IMCI|KNC)
+        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+        SIMD_FLAGS='';;
+      GEN)
+        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
+        SIMD_FLAGS='';;
+      *)
+        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
+    esac;;
+  *)
+    AC_MSG_WARN([Compiler unknown, using generic vector code])
+    AC_DEFINE([GENERIC_VEC],[1],[generic vector code]);;
+esac
+AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
+AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"

 case ${ac_SIMD} in
-     SSE4)
-       echo Configuring for SSE4
-       AC_DEFINE([SSE4],[1],[SSE4 Intrinsics] )
-       if test x"$ax_cv_support_ssse3_ext" = x"yes"; then  dnl minimal support for SSE4
-         supported=yes
-       else
-  	AC_MSG_WARN([Your processor does not support SSE4 instructions])
-       fi
-     ;;
-     AVX)
-       echo Configuring for AVX
-       AC_DEFINE([AVX1],[1],[AVX Intrinsics] )
-       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
-       supported=yes			  
-       else
-       	AC_MSG_WARN([Your processor does not support AVX instructions])
-       fi
-     ;;
-     AVXFMA4)
-       echo Configuring for AVX
-       AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
-       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
-       supported=yes			  
-       else
-       	AC_MSG_WARN([Your processor does not support AVX instructions])
-       fi
-     ;;
-     AVX2)
-       echo Configuring for AVX2
-       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
-       if test x"$ax_cv_support_avx2_ext" = x"yes"; then  dnl minimal support for AVX2
-       supported=yes
-       else
-       AC_MSG_WARN([Your processor does not support AVX2 instructions])
-       fi
-     ;;
-     AVX512)
-       echo Configuring for AVX512 
-       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
-       supported="cross compilation"
-       ac_ZMM=yes;
-     ;;
-     IMCI)
-       echo Configuring for IMCI
-       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
-       supported="cross compilation"
-       ac_ZMM=no;
-     ;;
-     NEONv8)
-       echo Configuring for experimental ARMv8a support 
-       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
-       supported="cross compilation"
-     ;;
-     DEBUG)
-       echo Configuring without SIMD support - only for compiler DEBUGGING!
-       AC_DEFINE([EMPTY_SIMD],[1],[EMPTY_SIMD only for DEBUGGING] )
-      ;;     
-     *)
-     AC_MSG_ERROR([${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options]); 
-     ;;
+  AVX512|AVX512MIC|KNL)
+    AC_DEFINE([TEST_ZMM],[1],[compile ZMM test]);;
+  *)
+	;;
 esac

-case ${ac_ZMM} in
-yes)
-	echo Enabling ZMM source code
-;;
-no)
-	echo Disabling ZMM source code
-;;
-esac
-
-AM_CONDITIONAL(BUILD_ZMM,[ test "X${ac_ZMM}X" == "XyesX" ])
-
+############### precision selection
 AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
 case ${ac_PRECISION} in
     single)
-       echo default precision is single
       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
     ;;
     double)
-       echo default precision is double
       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
     ;;
 esac

-#
-# Comms selection
-#
-
+############### communication type selection
 AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

 case ${ac_COMMS} in
     none)
-       echo Configuring for NO communications
       AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
     ;;
     mpi)
-       echo Configuring for MPI communications
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
+       LX_FIND_MPI
+       if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
+       AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
+       AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
+       AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
+       LIBS="`echo $MPI_CXXLDFLAGS | sed -E 's/-L@<:@^ @:>@+//g'` $LIBS"
     ;;
     shmem)
-       echo Configuring for SHMEM communications
       AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
     ;;
     *)
     AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); 
     ;;
 esac
-
 AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])

-#
-# RNG selection
-#
+############### RNG selection
 AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\
 	[Select Random Number Generator to be used])],\
 	[ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
+
 case ${ac_RNG} in
     ranlux48)
-     AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
+      AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
     ;;
     mt19937)
-     AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
+      AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
     ;;
     *)
-     AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
+      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
     ;;
 esac

-#
-# SDE timing mode
-#
-AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
+############### timer option
+AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers],\
 	[Enable system dependent high res timers])],\
 	[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
 case ${ac_TIMERS} in
     yes)
-     AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
+      AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
     ;;
     no)
-     AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
+      AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
     ;;
     *)
-     AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
+      AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
     ;;
 esac

-#
-# Chroma regression tests
-#
+############### Chroma regression test
 AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
-
 case ${ac_CHROMA} in
-     yes)
-       echo Enabling tests regressing to Chroma
-     ;;
-     no)
-       echo Disabling tests regressing to Chroma
+     yes|no)
     ;;
     *)
-     AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
+       AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
     ;;
 esac
-
 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])

-#
-# Lapack
-#
-AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+############### Doxygen
+AC_PROG_DOXYGEN

-case ${ac_LAPACK} in
-     yes)
-       echo Enabling lapack
-     ;;
-     no)
-       echo Disabling lapack
-     ;;
-     *)
-       echo Enabling lapack at ${ac_LAPACK}
-     ;;
-esac
+if test -n "$DOXYGEN"
+then
+AC_CONFIG_FILES([docs/doxy.cfg])
+fi

-AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
-AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
-
-###################################################################
-# Checks for doxygen support
-# if present enables the "make doxyfile" command
-#echo
-#echo Checking doxygen support 
-#echo :::::::::::::::::::::::::::::::::::::::::::
-#AC_PROG_DOXYGEN
-
-#if test -n "$DOXYGEN"
-#then
-#AC_CONFIG_FILES([docs/doxy.cfg])
-#fi
-
-echo
-echo Creating configuration files
-echo :::::::::::::::::::::::::::::::::::::::::::
+############### Output
+cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
+AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
+AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
+AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
+AC_SUBST([AM_CFLAGS])
+AC_SUBST([AM_CXXFLAGS])
+AC_SUBST([AM_LDFLAGS])
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
+AC_CONFIG_FILES(tests/IO/Makefile)
+AC_CONFIG_FILES(tests/core/Makefile)
+AC_CONFIG_FILES(tests/debug/Makefile)
+AC_CONFIG_FILES(tests/forces/Makefile)
+AC_CONFIG_FILES(tests/hmc/Makefile)
+AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_CONFIG_FILES(programs/Makefile)
 AC_CONFIG_FILES(programs/Hadrons/Makefile)
 AC_OUTPUT

-
 echo "
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Summary of configuration for $PACKAGE v$VERSION
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The following features are enabled:
-
+----- PLATFORM ----------------------------------------
 - architecture (build)          : $build_cpu
 - os (build)                    : $build_os
 - architecture (target)         : $target_cpu
 - os (target)                   : $target_os
- build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
- graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
- Supported SIMD flags          : $SIMD_FLAGS
----------------------------------------------------------
- enabled simd support          : ${ac_SIMD}   (config macro says supported: $supported )
+- compiler vendor               : ${ax_cv_cxx_compiler_vendor}
+- compiler version              : ${ax_cv_gxx_version}
+----- BUILD OPTIONS -----------------------------------
+- SIMD                          : ${ac_SIMD}
 - communications type           : ${ac_COMMS}
 - default precision             : ${ac_PRECISION}
 - RNG choice                    : ${ac_RNG} 
- LAPACK	                : ${ac_LAPACK} 
-
-
+- GMP                           : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
+- LAPACK                        : ${ac_LAPACK}
+- build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
+- graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
+----- BUILD FLAGS -------------------------------------
+- CXXFLAGS: 
+`echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
+- LDFLAGS:
+`echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
+- LIBS:
+`echo ${LIBS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
+-------------------------------------------------------
 "
--- a/eigen-3.2.9.tar.bz2
+++ b/eigen-3.2.9.tar.bz2
--- a/include/Grid
+++ b/include/Grid
@ -0,0 +1 @@
+../lib
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@ -29,27 +29,28 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H

-#include <algorithms/SparseMatrix.h>
-#include <algorithms/LinearOperator.h>
-#include <algorithms/Preconditioner.h>
+#include <Grid/algorithms/SparseMatrix.h>
+#include <Grid/algorithms/LinearOperator.h>
+#include <Grid/algorithms/Preconditioner.h>

-#include <algorithms/approx/Zolotarev.h>
-#include <algorithms/approx/Chebyshev.h>
-#include <algorithms/approx/Remez.h>
-#include <algorithms/approx/MultiShiftFunction.h>
+#include <Grid/algorithms/approx/Zolotarev.h>
+#include <Grid/algorithms/approx/Chebyshev.h>
+#include <Grid/algorithms/approx/Remez.h>
+#include <Grid/algorithms/approx/MultiShiftFunction.h>

-#include <algorithms/iterative/ConjugateGradient.h>
-#include <algorithms/iterative/ConjugateResidual.h>
-#include <algorithms/iterative/NormalEquations.h>
-#include <algorithms/iterative/SchurRedBlack.h>
+#include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateResidual.h>
+#include <Grid/algorithms/iterative/NormalEquations.h>
+#include <Grid/algorithms/iterative/SchurRedBlack.h>

-#include <algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>

 // Lanczos support
-#include <algorithms/iterative/MatrixUtils.h>
-#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/MatrixUtils.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>

-#include <algorithms/CoarsenedMatrix.h>
+#include <Grid/algorithms/CoarsenedMatrix.h>

 // Eigen/lanczos
 // EigCg
--- a/lib/Cartesian.h
+++ b/lib/Cartesian.h
@ -28,8 +28,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_H
 #define GRID_CARTESIAN_H

-#include <cartesian/Cartesian_base.h>
-#include <cartesian/Cartesian_full.h>
-#include <cartesian/Cartesian_red_black.h> 
+#include <Grid/cartesian/Cartesian_base.h>
+#include <Grid/cartesian/Cartesian_full.h>
+#include <Grid/cartesian/Cartesian_red_black.h> 

 #endif
--- a/lib/Communicator.h
+++ b/lib/Communicator.h
@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H

-#include <communicator/Communicator_base.h>
+#include <Grid/communicator/Communicator_base.h>

 #endif
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@ -28,17 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_H_
 #define _GRID_CSHIFT_H_

-#include <cshift/Cshift_common.h>
+#include <Grid/cshift/Cshift_common.h>

 #ifdef GRID_COMMS_NONE
-#include <cshift/Cshift_none.h>
+#include <Grid/cshift/Cshift_none.h>
 #endif

 #ifdef GRID_COMMS_MPI
-#include <cshift/Cshift_mpi.h>
+#include <Grid/cshift/Cshift_mpi.h>
 #endif 

 #ifdef GRID_COMMS_SHMEM
-#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
 #endif
--- a/lib/Grid.h
+++ b/lib/Grid.h
@ -59,29 +59,29 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ///////////////////
 // Grid headers
 ///////////////////
-#include <serialisation/Serialisation.h>
-#include <Config.h>
-#include <Timer.h>
-#include <PerfCount.h>
-#include <Log.h>
-#include <AlignedAllocator.h>
-#include <Simd.h>
-#include <Threads.h>
-#include <Lexicographic.h>
-#include <Communicator.h> 
-#include <Cartesian.h>    
-#include <Tensors.h>      
-#include <Lattice.h>      
-#include <Cshift.h>       
-#include <Stencil.h>      
-#include <Algorithms.h>   
-#include <parallelIO/BinaryIO.h>
-#include <qcd/QCD.h>
-#include <parallelIO/NerscIO.h>
-#include <Init.h>
+#include <Grid/serialisation/Serialisation.h>
+#include "Config.h"
+#include <Grid/Timer.h>
+#include <Grid/PerfCount.h>
+#include <Grid/Log.h>
+#include <Grid/AlignedAllocator.h>
+#include <Grid/Simd.h>
+#include <Grid/Threads.h>
+#include <Grid/Lexicographic.h>
+#include <Grid/Communicator.h> 
+#include <Grid/Cartesian.h>    
+#include <Grid/Tensors.h>      
+#include <Grid/Lattice.h>      
+#include <Grid/Cshift.h>       
+#include <Grid/Stencil.h>      
+#include <Grid/Algorithms.h>   
+#include <Grid/parallelIO/BinaryIO.h>
+#include <Grid/qcd/QCD.h>
+#include <Grid/parallelIO/NerscIO.h>
+#include <Grid/Init.h>

-#include <qcd/hmc/NerscCheckpointer.h>
-#include <qcd/hmc/HmcRunner.h>
+#include <Grid/qcd/hmc/NerscCheckpointer.h>
+#include <Grid/qcd/hmc/HmcRunner.h>



--- a/lib/Init.cc
+++ b/lib/Init.cc
@ -193,7 +193,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
-    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;
+    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
    exit(EXIT_SUCCESS);
  }

@ -234,26 +234,34 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }

+  std::string COL_RED    = GridLogColours.colour["RED"];
+  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
+  std::string COL_BLACK  = GridLogColours.colour["BLACK"];
+  std::string COL_GREEN  = GridLogColours.colour["GREEN"];
+  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
+  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
+  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
+
+  
  std::cout <<std::endl;
-  std::cout <<Logger::RED  << "__|__|__|__|__"<<             "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<Logger::RED  << "__|__|__|__|__"<<             "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<Logger::RED  << "__|__|  |  |  "<<             "|  |  | "<<Logger::PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl; 
-  std::cout <<Logger::RED  << "__|__         "<<             "        "<<Logger::PURPLE<<"        "<<                "          _|__"<<std::endl; 
-  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<" GGGG   "<<Logger::RED<<" RRRR   "<<Logger::BLUE  <<" III    "<<Logger::PURPLE<<"DDDD  "<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<"G       "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D   D "<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<"G       "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D    D"<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<"G  GG   "<<Logger::RED<<" RRRR   "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D    D"<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<"G   G   "<<Logger::RED<<" R  R   "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D   D "<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<" GGGG   "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<" III    "<<Logger::PURPLE<<"DDDD  "<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<Logger::BLUE << "__|__         "<<             "        "<<Logger::GREEN <<"        "<<                "          _|__"<<std::endl; 
-  std::cout <<Logger::BLUE << "__|__|__|__|__"<<             "|__|__|_"<<Logger::GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<Logger::BLUE << "__|__|__|__|__"<<             "|__|__|_"<<Logger::GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<Logger::BLUE << "  |  |  |  |  "<<             "|  |  | "<<Logger::GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl; 
+  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__|  |  |  "<<             "|  |  | "<<COL_PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|__         "<<             "        "<<COL_PURPLE<<"        "<<                "          _|__"<<std::endl; 
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G  GG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G   G   "<<COL_RED<<" R  R   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" R   R  "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|__         "<<             "        "<<COL_GREEN <<"        "<<                "          _|__"<<std::endl; 
+  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<COL_BLUE << "  |  |  |  |  "<<             "|  |  | "<<COL_GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl; 
  std::cout << std::endl;
  std::cout << std::endl;
-  std::cout <<Logger::YELLOW<< std::endl;
+  std::cout <<COL_YELLOW<< std::endl;
  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
-  std::cout << "Colours by Tadahito Boyle "<<std::endl;
  std::cout << std::endl;
  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
@ -264,7 +272,8 @@ void Grid_init(int *argc,char ***argv)
  std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
  std::cout << "GNU General Public License for more details."<<std::endl;
-  std::cout << Logger::BLACK <<std::endl;
+  std::cout << COL_BACKGROUND <<std::endl;
+  std::cout << std::endl;
 }

  
--- a/lib/Lattice.h
+++ b/lib/Lattice.h
@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_LATTICE_H
 #define GRID_LATTICE_H

-#include <lattice/Lattice_base.h>
+#include <Grid/lattice/Lattice_base.h>

 #endif
--- a/lib/Log.cc
+++ b/lib/Log.cc
@ -1,126 +1,92 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/Log.cc
+Source file: ./lib/Log.cc

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #include <Grid.h>

 namespace Grid {

 GridStopWatch Logger::StopWatch;
-std::ostream  Logger::devnull(0);
-std::string Logger::BLACK("\033[30m");
-std::string Logger::RED("\033[31m");
-std::string Logger::GREEN("\033[32m");
-std::string Logger::YELLOW("\033[33m");
-std::string Logger::BLUE("\033[34m");
-std::string Logger::PURPLE("\033[35m");
-std::string Logger::CYAN("\033[36m");
-std::string Logger::WHITE("\033[37m");
-std::string Logger::NORMAL("\033[0;39m");
-std::string EMPTY("");
+std::ostream Logger::devnull(0);

-#if 0  
-  GridLogger GridLogError      (1,"Error",Logger::RED);
-  GridLogger GridLogWarning    (1,"Warning",Logger::YELLOW);
-  GridLogger GridLogMessage    (1,"Message",Logger::BLACK);
-  GridLogger GridLogDebug      (1,"Debug",Logger::PURPLE);
-  GridLogger GridLogPerformance(1,"Performance",Logger::GREEN);
-  GridLogger GridLogIterative  (1,"Iterative",Logger::BLUE);
-  GridLogger GridLogIntegrator (1,"Integrator",Logger::BLUE);
-#else
-  GridLogger GridLogError      (1,"Error",EMPTY);
-  GridLogger GridLogWarning    (1,"Warning",EMPTY);
-  GridLogger GridLogMessage    (1,"Message",EMPTY);
-  GridLogger GridLogDebug      (1,"Debug",EMPTY);
-  GridLogger GridLogPerformance(1,"Performance",EMPTY);
-  GridLogger GridLogIterative  (1,"Iterative",EMPTY);
-  GridLogger GridLogIntegrator (1,"Integrator",EMPTY);
-#endif
+Colours GridLogColours(0);
+GridLogger GridLogError(1, "Error", GridLogColours, "RED");
+GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
+GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
+GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
+GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
+GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
+GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");

-void GridLogConfigure(std::vector<std::string> &logstreams)
-{
+void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
  GridLogWarning.Active(0);
-  GridLogMessage.Active(0);
+  GridLogMessage.Active(1); // at least the messages should be always on
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(0);
+  GridLogColours.Active(0);

-  int blackAndWhite = 1;
-  if(blackAndWhite){
-    Logger::BLACK = std::string("");
-    Logger::RED    =Logger::BLACK;
-    Logger::GREEN  =Logger::BLACK;
-    Logger::YELLOW =Logger::BLACK;
-    Logger::BLUE   =Logger::BLACK;
-    Logger::PURPLE =Logger::BLACK;
-    Logger::CYAN   =Logger::BLACK;
-    Logger::WHITE  =Logger::BLACK;
-    Logger::NORMAL =Logger::BLACK;
-  }
-
-  for(int i=0;i<logstreams.size();i++){
-    if ( logstreams[i]== std::string("Error")       ) GridLogError.Active(1);
-    if ( logstreams[i]== std::string("Warning")     ) GridLogWarning.Active(1);
-    if ( logstreams[i]== std::string("Message")     ) GridLogMessage.Active(1);
-    if ( logstreams[i]== std::string("Iterative")   ) GridLogIterative.Active(1);
-    if ( logstreams[i]== std::string("Debug")       ) GridLogDebug.Active(1);
-    if ( logstreams[i]== std::string("Performance") ) GridLogPerformance.Active(1);
-    if ( logstreams[i]== std::string("Integrator" ) ) GridLogIntegrator.Active(1);
+  for (int i = 0; i < logstreams.size(); i++) {
+    if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
+    if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
+    if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
+    if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
+    if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
+    if (logstreams[i] == std::string("Performance"))
+      GridLogPerformance.Active(1);
+    if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
+    if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
  }
 }

 ////////////////////////////////////////////////////////////
 // Verbose limiter on MPI tasks
 ////////////////////////////////////////////////////////////
-void Grid_quiesce_nodes(void)
-{
-  int me=0;
+void Grid_quiesce_nodes(void) {
+  int me = 0;
 #ifdef GRID_COMMS_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD,&me);
+  MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
  me = shmem_my_pe();
 #endif
-  if ( me ) { 
+  if (me) {
    std::cout.setstate(std::ios::badbit);
  }
 }

-void Grid_unquiesce_nodes(void)
-{
+void Grid_unquiesce_nodes(void) {
 #ifdef GRID_COMMS_MPI
-    std::cout.clear();
+  std::cout.clear();
 #endif
 }
-
-
 }
-
--- a/lib/Log.h
+++ b/lib/Log.h
@ -6,9 +6,9 @@

    Copyright (C) 2015

-Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Antonin Portelli <antonin.portelli@me.com>
+    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -27,6 +27,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
+#include <map>
+
 #ifndef GRID_LOG_H
 #define GRID_LOG_H

@ -34,56 +37,99 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <execinfo.h>
 #endif

-namespace Grid {
+    namespace Grid {

 // Dress the output; use std::chrono for time stamping via the StopWatch class
 int Rank(void); // used for early stage debug before library init


+class Colours{
+protected:
+  bool is_active;
+public:
+  std::map<std::string, std::string> colour;
+
+  Colours(bool activate=false){
+    Active(activate);
+  };
+
+  void Active(bool activate){
+    is_active=activate;
+
+    if (is_active){
+     colour["BLACK"]  ="\033[30m";
+     colour["RED"]    ="\033[31m";
+     colour["GREEN"]  ="\033[32m";
+     colour["YELLOW"] ="\033[33m";
+     colour["BLUE"]   ="\033[34m";
+     colour["PURPLE"] ="\033[35m";
+     colour["CYAN"]   ="\033[36m";
+     colour["WHITE"]  ="\033[37m";
+     colour["NORMAL"] ="\033[0;39m";
+   } else {
+    colour["BLACK"] ="";
+    colour["RED"]   ="";
+    colour["GREEN"] ="";
+    colour["YELLOW"]="";
+    colour["BLUE"]  ="";
+    colour["PURPLE"]="";
+    colour["CYAN"]  ="";
+    colour["WHITE"] ="";
+    colour["NORMAL"]="";
+  }
+
+
+};
+
+};
+
+
 class Logger {
 protected:
-    int active;
-    std::string name, topName, COLOUR;
-public:
-    static GridStopWatch StopWatch;
-    static std::ostream devnull;
+  Colours &Painter;
+  int active;
+  std::string name, topName;
+  std::string COLOUR;

-    static std::string BLACK;
-    static std::string RED  ;
-    static std::string GREEN;
-    static std::string YELLOW;
-    static std::string BLUE  ;
-    static std::string PURPLE;
-    static std::string CYAN  ;
-    static std::string WHITE ;
-    static std::string NORMAL;
-    
- Logger(std::string topNm, int on, std::string nm,std::string col)
-   : active(on), name(nm), topName(topNm), COLOUR(col) {};
-    
-    void Active(int on) {active = on;};
-    int  isActive(void) {return active;};
-    
-    friend std::ostream& operator<< (std::ostream& stream, const Logger& log){
-        if ( log.active ) {
-            StopWatch.Stop();
-            GridTime now = StopWatch.Elapsed();
-            StopWatch.Start();
-            stream << BLACK <<std::setw(8) << std::left << log.topName << BLACK<< " : ";
-            stream << log.COLOUR <<std::setw(11)  << log.name << BLACK << " : ";
-            stream << YELLOW <<std::setw(6) << now <<BLACK << " : " ;
-            stream << log.COLOUR;
-            return stream;
-        } else { 
-            return devnull;
-        }
+public:
+  static GridStopWatch StopWatch;
+  static std::ostream devnull;
+
+  std::string background() {return Painter.colour["NORMAL"];}
+  std::string evidence() {return Painter.colour["YELLOW"];}
+  std::string colour() {return Painter.colour[COLOUR];}
+
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
+  : active(on),
+  name(nm),
+  topName(topNm),
+  Painter(col_class),
+  COLOUR(col){} ;
+  
+  void Active(int on) {active = on;};
+  int  isActive(void) {return active;};
+  
+  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
+
+    if ( log.active ) {
+      StopWatch.Stop();
+      GridTime now = StopWatch.Elapsed();
+      StopWatch.Start();
+      stream << log.background()<< log.topName << log.background()<< " : ";
+      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
+      stream << log.evidence()<< now << log.background() << " : " << log.colour();
+      return stream;
+    } else { 
+      return devnull;
    }
-    
+  }
+
 };
-    
+
 class GridLogger: public Logger {
 public:
- GridLogger(int on, std::string nm, std::string col = Logger::BLACK): Logger("Grid", on, nm, col){};
+  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
+  Logger("Grid", on, nm, col_class, col_key){};
 };

 void GridLogConfigure(std::vector<std::string> &logstreams);
@ -95,38 +141,40 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
+extern Colours    GridLogColours;


 #define _NBACKTRACE (256)
 extern void * Grid_backtrace_buffer[_NBACKTRACE];

 #define BACKTRACEFILE() {\
-    char string[20];					\
-    std::sprintf(string,"backtrace.%d",Rank());				\
-    std::FILE * fp = std::fopen(string,"w");				\
-    BACKTRACEFP(fp)\
-    std::fclose(fp);	    \
+char string[20];					\
+std::sprintf(string,"backtrace.%d",Rank());				\
+std::FILE * fp = std::fopen(string,"w");				\
+BACKTRACEFP(fp)\
+std::fclose(fp);	    \
 }


 #ifdef HAVE_EXECINFO_H
 #define BACKTRACEFP(fp) { \
-  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
-  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
-  for (int i = 0; i < symbols; i++){\
-    std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
-  }\
+int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
+char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+for (int i = 0; i < symbols; i++){\
+  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+}\
 }
 #else 
 #define BACKTRACEFP(fp) { \
-    std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
-    std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
+std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
 }
 #endif

 #define BACKTRACE() BACKTRACEFP(stdout) 

+
 }
 #endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@ -1,6 +1,3 @@
-# additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/
-
 extra_sources=
 if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
@ -17,16 +14,11 @@ endif
 #
 # Libraries
 #
-
 include Make.inc
+include Eigen.inc

-lib_LIBRARIES = libGrid.a
-libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
-
-
-#	qcd/action/fermion/PartialFractionFermion5D.cc\	\
-#
-# Include files
-#
-nobase_include_HEADERS=$(HFILES)
+lib_LTLIBRARIES = libGrid.la

+libGrid_la_SOURCES             = $(CCFILES) $(extra_sources)
+libGrid_ladir                  = $(pkgincludedir)
+nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
--- a/lib/Simd.h
+++ b/lib/Simd.h
@ -1,32 +1,33 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/Simd.h
+Source file: ./lib/Simd.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H

@ -118,6 +119,14 @@ namespace Grid {
  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+
+  // define projections to real and imaginary parts
+  inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
+  inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
+  inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
+  inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
+
+  // define auxiliary functions for complex computations
  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
@ -163,8 +172,8 @@ namespace Grid {

 };

-#include <simd/Grid_vector_types.h>
-#include <simd/Grid_vector_unops.h>
+#include "simd/Grid_vector_types.h"
+#include "simd/Grid_vector_unops.h"

 namespace Grid {
  // Default precision
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@ -30,7 +30,7 @@

 #include <thread>

- #include <stencil/Lebesgue.h>   // subdir aggregate
+ #include <Grid/stencil/Lebesgue.h>   // subdir aggregate

 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@ -30,22 +30,22 @@ Author: neo <cossu@post.kek.jp>
 #ifndef GRID_MATH_H
 #define GRID_MATH_H

-#include <tensors/Tensor_traits.h>
-#include <tensors/Tensor_class.h>
-#include <tensors/Tensor_arith.h>
-#include <tensors/Tensor_inner.h>
-#include <tensors/Tensor_outer.h>
-#include <tensors/Tensor_transpose.h>
-#include <tensors/Tensor_trace.h>
-#include <tensors/Tensor_index.h>
-#include <tensors/Tensor_Ta.h>
-#include <tensors/Tensor_determinant.h>
-#include <tensors/Tensor_exp.h>
-//#include <tensors/Tensor_peek.h>
-//#include <tensors/Tensor_poke.h>
-#include <tensors/Tensor_reality.h>
-#include <tensors/Tensor_unary.h>
-#include <tensors/Tensor_extract_merge.h>
-#include <tensors/Tensor_logical.h>
+#include <Grid/tensors/Tensor_traits.h>
+#include <Grid/tensors/Tensor_class.h>
+#include <Grid/tensors/Tensor_arith.h>
+#include <Grid/tensors/Tensor_inner.h>
+#include <Grid/tensors/Tensor_outer.h>
+#include <Grid/tensors/Tensor_transpose.h>
+#include <Grid/tensors/Tensor_trace.h>
+#include <Grid/tensors/Tensor_index.h>
+#include <Grid/tensors/Tensor_Ta.h>
+#include <Grid/tensors/Tensor_determinant.h>
+#include <Grid/tensors/Tensor_exp.h>
+//#include <Grid/tensors/Tensor_peek.h>
+//#include <Grid/tensors/Tensor_poke.h>
+#include <Grid/tensors/Tensor_reality.h>
+#include <Grid/tensors/Tensor_unary.h>
+#include <Grid/tensors/Tensor_extract_merge.h>
+#include <Grid/tensors/Tensor_logical.h>

 #endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H

-#include <Grid.h>

 namespace Grid {

--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
 #define  GRID_ALGORITHM_SPARSE_MATRIX_H

-#include <Grid.h>

 namespace Grid {

--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CHEBYSHEV_H
 #define GRID_CHEBYSHEV_H

-#include<Grid.h>
-#include<algorithms/LinearOperator.h>
+#include <Grid/algorithms/LinearOperator.h>

 namespace Grid {

--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@ -18,10 +18,10 @@
 #include <stddef.h>
 #include <Config.h>

-#ifdef HAVE_GMP_H
-#include <algorithms/approx/bigfloat.h>
+#ifdef HAVE_LIBGMP
+#include "bigfloat.h"
 #else
-#include <algorithms/approx/bigfloat_double.h>
+#include "bigfloat_double.h"
 #endif

 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@ -40,9 +40,10 @@ namespace Grid {
  template<class Field> 
    class ConjugateGradient : public OperatorFunction<Field> {
 public:                                                
+    bool ErrorOnNoConverge; //throw an assert when the CG fails to converge. Defaults true.
    RealD   Tolerance;
    Integer MaxIterations;
-    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
+  ConjugateGradient(RealD tol,Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){ 
    };


@ -137,13 +138,15 @@ public:
 	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 	  std::cout<<std::endl;
 	  
-	  assert(true_residual/Tolerance < 1000.0);
+	  if(ErrorOnNoConverge)
+	    assert(true_residual/Tolerance < 1000.0);

 	  return;
 	}
      }
      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
-      assert(0);
+      if(ErrorOnNoConverge)	
+	assert(0);
    }
  };
 }
--- a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -0,0 +1,142 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
+
+namespace Grid {
+
+  //Mixed precision restarted defect correction CG
+  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
+  public:                                                
+    RealD   Tolerance;
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+    LinearOperatorBase<FieldF> &Linop_f;
+    LinearOperatorBase<FieldD> &Linop_d;
+
+    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+    LinearFunction<FieldF> *guesser;
+    
+    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
+      Linop_f(_Linop_f), Linop_d(_Linop_d),
+      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      OuterLoopNormMult(100.), guesser(NULL){ };
+
+    void useGuesser(LinearFunction<FieldF> &g){
+      guesser = &g;
+    }
+  
+    void operator() (const FieldD &src_d_in, FieldD &sol_d){
+      GridStopWatch TotalTimer;
+      TotalTimer.Start();
+    
+      int cb = src_d_in.checkerboard;
+      sol_d.checkerboard = cb;
+    
+      RealD src_norm = norm2(src_d_in);
+      RealD stop = src_norm * Tolerance*Tolerance;
+
+      GridBase* DoublePrecGrid = src_d_in._grid;
+      FieldD tmp_d(DoublePrecGrid);
+      tmp_d.checkerboard = cb;
+    
+      FieldD tmp2_d(DoublePrecGrid);
+      tmp2_d.checkerboard = cb;
+    
+      FieldD src_d(DoublePrecGrid);
+      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
+    
+      RealD inner_tol = Tolerance;
+    
+      FieldF src_f(SinglePrecGrid);
+      src_f.checkerboard = cb;
+    
+      FieldF sol_f(SinglePrecGrid);
+      sol_f.checkerboard = cb;
+    
+      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      CG_f.ErrorOnNoConverge = false;
+
+      GridStopWatch InnerCGtimer;
+
+      GridStopWatch PrecChangeTimer;
+    
+      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+	//Compute double precision rsd and also new RHS vector.
+	Linop_d.HermOp(sol_d, tmp_d);
+	RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
+      
+	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
+
+	if(norm < OuterLoopNormMult * stop){
+	  std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
+	  break;
+	}
+	while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+
+	PrecChangeTimer.Start();
+	precisionChange(src_f, src_d);
+	PrecChangeTimer.Stop();
+      
+	zeroit(sol_f);
+
+	//Optionally improve inner solver guess (eg using known eigenvectors)
+	if(guesser != NULL)
+	  (*guesser)(src_f, sol_f);
+
+	//Inner CG
+	CG_f.Tolerance = inner_tol;
+	InnerCGtimer.Start();
+	CG_f(Linop_f, src_f, sol_f);
+	InnerCGtimer.Stop();
+      
+	//Convert sol back to double and add to double prec solution
+	PrecChangeTimer.Start();
+	precisionChange(tmp_d, sol_f);
+	PrecChangeTimer.Stop();
+      
+	axpy(sol_d, 1.0, tmp_d, sol_d);
+      }
+    
+      //Final trial CG
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
+    
+      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      CG_d(Linop_d, src_d_in, sol_d);
+
+      TotalTimer.Stop();
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+    }
+  };
+
+}
+
+#endif
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@ -130,8 +130,8 @@ DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st,

 }

-#include <algorithms/iterative/Householder.h>
-#include <algorithms/iterative/Francis.h>
+#include "Householder.h"
+#include "Francis.h"

 #endif

--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -33,8 +33,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef USE_LAPACK
 #include <lapacke.h>
 #endif
-#include <algorithms/iterative/DenseMatrix.h>
-#include <algorithms/iterative/EigenSort.h>
+#include "DenseMatrix.h"
+#include "EigenSort.h"

 namespace Grid {

--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_BASE_H
 #define GRID_CARTESIAN_BASE_H

-#include <Grid.h>

 namespace Grid{

@ -107,6 +106,12 @@ public:
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
+    virtual int iIndex(std::vector<int> &lcoor)
+    {
+        int idx=0;
+        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
+        return idx;
+    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0; 
@ -123,12 +128,6 @@ public:
    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
-    inline int iIndex(std::vector<int> &lcoor)
-    {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
-        return idx;
-    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
@ -220,7 +219,7 @@ public:
      }

      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
-      o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
+      o_idx= oIndex(lcoor);  // this implies divide by 2 on checkerdim
    }

    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@ -32,17 +32,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

-    static const int CbRed  =0;
-    static const int CbBlack=1;
-    static const int Even   =CbRed;
-    static const int Odd    =CbBlack;
-
-    // Perhaps these are misplaced and 
-    // should be in sparse matrix.
-    // Also should make these a named enum type
-    static const int DaggerNo=0;
-    static const int DaggerYes=1;
-
+  static const int CbRed  =0;
+  static const int CbBlack=1;
+  static const int Even   =CbRed;
+  static const int Odd    =CbBlack;
+    
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
@ -224,9 +218,21 @@ protected:
 	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
 	}
      }
-        return idx;
+      return idx;
    };
        
+    virtual int iIndex(std::vector<int> &lcoor)
+    {
+        int idx=0;
+        for(int d=0;d<_ndimension;d++) {
+	  if( d==_checker_dim ) {
+	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
+	  } else { 
+	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
+	  }
+	}
+        return idx;
+    }
 };

 }
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@ -127,12 +127,21 @@ class CartesianCommunicator {
 			int recv_from_rank,
 			int bytes);

+    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+			    void *xmit,
+			    int xmit_to_rank,
+			    void *recv,
+			    int recv_from_rank,
+			    int bytes);
+
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
+
+    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

    ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@ -144,6 +144,28 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
+// Set up (but do not start) a persistent send/receive pair for one halo
+// exchange and append both requests to `list`: the send first, then the
+// receive.
+//   xmit/dest : buffer to send and the rank it is sent to
+//   recv/from : buffer to fill and the rank the data arrives from
+//   bytes     : size of each buffer, transferred as MPI_CHAR
+// Sends are tagged with the sender's own rank, so the receive has to be
+// posted with source == tag == `from` in order to pair with the peer's send.
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+					       void *xmit,
+					       int dest,
+					       void *recv,
+					       int from,
+					       int bytes)
+{
+  MPI_Request xrq;
+  MPI_Request rrq;
+  int ierr;
+  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+  assert(ierr==0);
+  list.push_back(xrq);
+  list.push_back(rrq);
+}
+// Start every persistent request held in `list` (as set up by
+// SendToRecvFromInit). An empty list is a no-op.
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{
+  if ( list.empty() ) return; // &list[0] is not valid on an empty vector
+  int ierr = MPI_Startall((int)list.size(),&list[0]);
+  assert(ierr==0);
+}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -151,17 +173,12 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-  
-  assert(ierr==0);
-
-  list.push_back(xrq);
-  list.push_back(rrq);
+  std::vector<CommsRequest_t> reqs(0);
+  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
+  SendToRecvFromBegin(reqs);
+  for(int i=0;i<reqs.size();i++){
+    list.push_back(reqs[i]);
+  }
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@ -84,6 +84,19 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
  assert(0);
 }
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  assert(0); // stub: no implementation in Communicator_none; trips the assert if called
+}
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{
+  assert(0); // stub: no implementation in Communicator_none; trips the assert if called
+}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@ -268,6 +268,10 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
+{
+  assert(0); // Unimplemented for the shmem communicator; trips the assert if called
+}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -280,6 +284,15 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
  shmem_putmem(recv,xmit,bytes,dest);
 }
+void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  assert(0); // Unimplemented for the shmem communicator; trips the assert if called
+}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  //  shmem_quiet();      // I'm done
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@ -1,73 +1,74 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/lattice/Lattice_ET.h
+Source file: ./lib/lattice/Lattice_ET.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_LATTICE_ET_H
 #define GRID_LATTICE_ET_H

 #include <iostream>
-#include <vector>
 #include <tuple>
 #include <typeinfo>
+#include <vector>

 namespace Grid {

-  ////////////////////////////////////////////////////
-  // Predicated where support
-  ////////////////////////////////////////////////////
-  template<class iobj,class vobj,class robj>
-    inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) {
+////////////////////////////////////////////////////
+// Predicated where support
+////////////////////////////////////////////////////
+template <class iobj, class vobj, class robj>
+inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+                            const robj &iffalse) {
+  typename std::remove_const<vobj>::type ret;

-    typename std::remove_const<vobj>::type ret;
+  typedef typename vobj::scalar_object scalar_object;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;

-    typedef typename vobj::scalar_object scalar_object;
-    typedef typename vobj::scalar_type scalar_type;
-    typedef typename vobj::vector_type vector_type;
+  const int Nsimd = vobj::vector_type::Nsimd();
+  const int words = sizeof(vobj) / sizeof(vector_type);

-    const int Nsimd = vobj::vector_type::Nsimd();
-    const int words = sizeof(vobj)/sizeof(vector_type);
+  std::vector<Integer> mask(Nsimd);
+  std::vector<scalar_object> truevals(Nsimd);
+  std::vector<scalar_object> falsevals(Nsimd);

-    std::vector<Integer> mask(Nsimd);
-    std::vector<scalar_object> truevals (Nsimd);
-    std::vector<scalar_object> falsevals(Nsimd);
+  extract(iftrue, truevals);
+  extract(iffalse, falsevals);
+  extract<vInteger, Integer>(TensorRemove(predicate), mask);

-    extract(iftrue   ,truevals);
-    extract(iffalse  ,falsevals);
-    extract<vInteger,Integer>(TensorRemove(predicate),mask);
-
-    for(int s=0;s<Nsimd;s++){
-      if (mask[s]) falsevals[s]=truevals[s];
-    }
-
-    merge(ret,falsevals);
-    return ret;
+  for (int s = 0; s < Nsimd; s++) {
+    if (mask[s]) falsevals[s] = truevals[s];
  }

+  merge(ret, falsevals);
+  return ret;
+}
+
 ////////////////////////////////////////////
 // recursive evaluation of expressions; Could
 // switch to generic approach with variadics, a la
@ -75,303 +76,351 @@ namespace Grid {
 // from tuple is hideous; C++14 introduces std::make_index_sequence for this
 ////////////////////////////////////////////

+// leaf eval of lattice ; should enable if protect using traits

-//leaf eval of lattice ; should enable if protect using traits
+template <typename T>
+using is_lattice = std::is_base_of<LatticeBase, T>;

-template <typename T> using is_lattice      = std::is_base_of<LatticeBase,T >;
+template <typename T>
+using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;

 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;

+// Lattice<T> specialisation of getVectorType: exposes the lattice's
+// vector_object as the nested `type`.
+template <typename T>
+struct getVectorType<Lattice<T> > {
+  using type = typename Lattice<T>::vector_object;
+};
+ 
 template<class sobj>
 inline sobj eval(const unsigned int ss, const sobj &arg)
 {
  return arg;
 }
-template<class lobj>
-inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg)
-{
-    return arg._odata[ss];
+template <class lobj>
+inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
+  return arg._odata[ss];
 }

 // handle nodes in syntax tree
 template <typename Op, typename T1>
-auto inline eval(const unsigned int ss, const LatticeUnaryExpression<Op,T1 > &expr) // eval one operand
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second))))
-{
-  return expr.first.func(eval(ss,std::get<0>(expr.second)));
+auto inline eval(
+    const unsigned int ss,
+    const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)));
 }

 template <typename Op, typename T1, typename T2>
-auto inline eval(const unsigned int ss, const LatticeBinaryExpression<Op,T1,T2> &expr) // eval two operands
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second))))
-{
-  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)));
+auto inline eval(
+    const unsigned int ss,
+    const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+                                eval(ss, std::get<1>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)),
+                         eval(ss, std::get<1>(expr.second)));
 }

 template <typename Op, typename T1, typename T2, typename T3>
-auto inline eval(const unsigned int ss, const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) // eval three operands
-  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second))))
-{
-  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second)) );
+auto inline eval(const unsigned int ss,
+                 const LatticeTrinaryExpression<Op, T1, T2, T3>
+                     &expr)  // eval three operands
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+                                eval(ss, std::get<1>(expr.second)),
+                                eval(ss, std::get<2>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)),
+                         eval(ss, std::get<1>(expr.second)),
+                         eval(ss, std::get<2>(expr.second)));
 }

 //////////////////////////////////////////////////////////////////////////
-// Obtain the grid from an expression, ensuring conformable. This must follow a tree recursion
+// Obtain the grid from an expression, ensuring conformable. This must follow a
+// tree recursion
 //////////////////////////////////////////////////////////////////////////
-template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
-inline void GridFromExpression(GridBase * &grid,const T1& lat)   // Lattice leaf
-{
-  if ( grid ) {
-    conformable(grid,lat._grid);
-  } 
-  grid=lat._grid;
-}
-template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
-inline void GridFromExpression(GridBase * &grid,const T1& notlat)   // non-lattice leaf
+template <class T1,
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf
 {
+  if (grid) {
+    conformable(grid, lat._grid);
+  }
+  grid = lat._grid;
 }
+template <class T1,
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase *&grid,
+                               const T1 &notlat)  // non-lattice leaf
+{}
 template <typename Op, typename T1>
-inline void GridFromExpression(GridBase * &grid,const LatticeUnaryExpression<Op,T1 > &expr)
-{
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse 
+inline void GridFromExpression(GridBase *&grid,
+                               const LatticeUnaryExpression<Op, T1> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
 }

 template <typename Op, typename T1, typename T2>
-inline void GridFromExpression(GridBase * &grid,const LatticeBinaryExpression<Op,T1,T2> &expr) 
-{
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse
-  GridFromExpression(grid,std::get<1>(expr.second));
+inline void GridFromExpression(
+    GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid, std::get<1>(expr.second));
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
-{
-  GridFromExpression(grid,std::get<0>(expr.second));// recurse
-  GridFromExpression(grid,std::get<1>(expr.second));
-  GridFromExpression(grid,std::get<2>(expr.second));
+inline void GridFromExpression(
+    GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid, std::get<1>(expr.second));
+  GridFromExpression(grid, std::get<2>(expr.second));
 }

-
 //////////////////////////////////////////////////////////////////////////
-// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
+// Obtain the CB from an expression, ensuring conformable. This must follow a
+// tree recursion
 //////////////////////////////////////////////////////////////////////////
-template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
-inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
+template <class T1,
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
 {
-  if ( (cb==Odd) || (cb==Even) ) {
-    assert(cb==lat.checkerboard);
-  } 
-  cb=lat.checkerboard;
+  if ((cb == Odd) || (cb == Even)) {
+    assert(cb == lat.checkerboard);
+  }
+  cb = lat.checkerboard;
  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
-template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
-inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
+template <class T1,
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
 {
  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 template <typename Op, typename T1>
-inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
-{
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
+inline void CBFromExpression(int &cb,
+                             const LatticeUnaryExpression<Op, T1> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }

 template <typename Op, typename T1, typename T2>
-inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) 
-{
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse
-  CBFromExpression(cb,std::get<1>(expr.second));
+inline void CBFromExpression(int &cb,
+                             const LatticeBinaryExpression<Op, T1, T2> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb, std::get<1>(expr.second));
  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
-{
-  CBFromExpression(cb,std::get<0>(expr.second));// recurse
-  CBFromExpression(cb,std::get<1>(expr.second));
-  CBFromExpression(cb,std::get<2>(expr.second));
+inline void CBFromExpression(
+    int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb, std::get<1>(expr.second));
+  CBFromExpression(cb, std::get<2>(expr.second));
  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }

 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
-#define GridUnopClass(name,ret)\
-template <class arg> struct name\
-{\
-  static auto inline func(const arg a)-> decltype(ret) { return ret; } \
-};
+#define GridUnopClass(name, ret)                                          \
+  template <class arg>                                                    \
+  struct name {                                                           \
+    static auto inline func(const arg a) -> decltype(ret) { return ret; } \
+  };

-GridUnopClass(UnarySub,-a);
-GridUnopClass(UnaryNot,Not(a));
-GridUnopClass(UnaryAdj,adj(a));
-GridUnopClass(UnaryConj,conjugate(a));
-GridUnopClass(UnaryTrace,trace(a));
-GridUnopClass(UnaryTranspose,transpose(a));
-GridUnopClass(UnaryTa,Ta(a));
-GridUnopClass(UnaryProjectOnGroup,ProjectOnGroup(a));
-GridUnopClass(UnaryReal,real(a));
-GridUnopClass(UnaryImag,imag(a));
-GridUnopClass(UnaryToReal,toReal(a));
-GridUnopClass(UnaryToComplex,toComplex(a));
-GridUnopClass(UnaryAbs,abs(a));
-GridUnopClass(UnarySqrt,sqrt(a));
-GridUnopClass(UnaryRsqrt,rsqrt(a));
-GridUnopClass(UnarySin,sin(a));
-GridUnopClass(UnaryCos,cos(a));
-GridUnopClass(UnaryLog,log(a));
-GridUnopClass(UnaryExp,exp(a));
+GridUnopClass(UnarySub, -a);
+GridUnopClass(UnaryNot, Not(a));
+GridUnopClass(UnaryAdj, adj(a));
+GridUnopClass(UnaryConj, conjugate(a));
+GridUnopClass(UnaryTrace, trace(a));
+GridUnopClass(UnaryTranspose, transpose(a));
+GridUnopClass(UnaryTa, Ta(a));
+GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryReal, real(a));
+GridUnopClass(UnaryImag, imag(a));
+GridUnopClass(UnaryToReal, toReal(a));
+GridUnopClass(UnaryToComplex, toComplex(a));
+GridUnopClass(UnaryTimesI, timesI(a));
+GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
+GridUnopClass(UnaryAbs, abs(a));
+GridUnopClass(UnarySqrt, sqrt(a));
+GridUnopClass(UnaryRsqrt, rsqrt(a));
+GridUnopClass(UnarySin, sin(a));
+GridUnopClass(UnaryCos, cos(a));
+GridUnopClass(UnaryAsin, asin(a));
+GridUnopClass(UnaryAcos, acos(a));
+GridUnopClass(UnaryLog, log(a));
+GridUnopClass(UnaryExp, exp(a));

 ////////////////////////////////////////////
 // Binary operators
 ////////////////////////////////////////////
-#define GridBinOpClass(name,combination)\
-template <class left,class right>\
-struct name\
-{\
-  static auto inline func(const left &lhs,const right &rhs)-> decltype(combination) const \
-    {\
-      return combination;\
-    }\
-}
-GridBinOpClass(BinaryAdd,lhs+rhs);
-GridBinOpClass(BinarySub,lhs-rhs);
-GridBinOpClass(BinaryMul,lhs*rhs);
+#define GridBinOpClass(name, combination)                      \
+  template <class left, class right>                           \
+  struct name {                                                \
+    static auto inline func(const left &lhs, const right &rhs) \
+        -> decltype(combination) const {                       \
+      return combination;                                      \
+    }                                                          \
+  }
+GridBinOpClass(BinaryAdd, lhs + rhs);
+GridBinOpClass(BinarySub, lhs - rhs);
+GridBinOpClass(BinaryMul, lhs *rhs);

-GridBinOpClass(BinaryAnd   ,lhs&rhs);
-GridBinOpClass(BinaryOr    ,lhs|rhs);
-GridBinOpClass(BinaryAndAnd,lhs&&rhs);
-GridBinOpClass(BinaryOrOr  ,lhs||rhs);
+GridBinOpClass(BinaryAnd, lhs &rhs);
+GridBinOpClass(BinaryOr, lhs | rhs);
+GridBinOpClass(BinaryAndAnd, lhs &&rhs);
+GridBinOpClass(BinaryOrOr, lhs || rhs);

 ////////////////////////////////////////////////////
 // Trinary conditional op
 ////////////////////////////////////////////////////
-#define GridTrinOpClass(name,combination)\
-template <class predicate,class left, class right>	\
-struct name\
-{\
-  static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \
-    {\
-      return combination;\
-    }\
-}
+#define GridTrinOpClass(name, combination)                                     \
+  template <class predicate, class left, class right>                          \
+  struct name {                                                                \
+    static auto inline func(const predicate &pred, const left &lhs,            \
+                            const right &rhs) -> decltype(combination) const { \
+      return combination;                                                      \
+    }                                                                          \
+  }

-GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
-			       typename std::remove_reference<left>::type, \
-			       typename std::remove_reference<right>::type> (pred,lhs,rhs)));
+GridTrinOpClass(
+    TrinaryWhere,
+    (predicatedWhere<predicate, typename std::remove_reference<left>::type,
+                     typename std::remove_reference<right>::type>(pred, lhs,
+                                                                  rhs)));

 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
- 
-#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
-#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
-#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>

-#define GRID_DEF_UNOP(op, name)\
-template <typename T1,\
-  typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr> inline auto op(const T1 &arg) \
-  -> decltype(LatticeUnaryExpression<GRID_UNOP(name),const T1&>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg)))) \
-{ return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg))); }
+#define GRID_UNOP(name) name<decltype(eval(0, arg))>
+#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) \
+  name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>

-#define GRID_BINOP_LEFT(op, name)\
-template <typename T1,typename T2,\
-          typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr>\
-inline auto op(const T1 &lhs,const T2&rhs) \
-  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-											    std::forward_as_tuple(lhs, rhs)))) \
-{\
- return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-									  std::forward_as_tuple(lhs, rhs))); \
-}
+#define GRID_DEF_UNOP(op, name)                                             \
+  template <typename T1,                                                    \
+            typename std::enable_if<is_lattice<T1>::value ||                \
+                                        is_lattice_expr<T1>::value,         \
+                                    T1>::type * = nullptr>                  \
+  inline auto op(const T1 &arg)                                             \
+      ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \
+          std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
+    return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \
+        std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \
+  }

-#define GRID_BINOP_RIGHT(op, name)\
- template <typename T1,typename T2,\
-           typename std::enable_if<!is_lattice<T1>::value && !is_lattice_expr<T1>::value, T1>::type* = nullptr,\
-           typename std::enable_if< is_lattice<T2>::value ||  is_lattice_expr<T2>::value, T2>::type* = nullptr> \
-inline auto op(const T1 &lhs,const T2&rhs)			\
-  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-											    std::forward_as_tuple(lhs, rhs)))) \
-{\
- return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-								          std::forward_as_tuple(lhs, rhs))); \
-}
+#define GRID_BINOP_LEFT(op, name)                                             \
+  template <typename T1, typename T2,                                         \
+            typename std::enable_if<is_lattice<T1>::value ||                  \
+                                        is_lattice_expr<T1>::value,           \
+                                    T1>::type * = nullptr>                    \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+      ->decltype(                                                             \
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+              std::make_pair(GRID_BINOP(name)(),                              \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
+  }

-#define GRID_DEF_BINOP(op, name)\
- GRID_BINOP_LEFT(op,name);\
- GRID_BINOP_RIGHT(op,name);
+#define GRID_BINOP_RIGHT(op, name)                                            \
+  template <typename T1, typename T2,                                         \
+            typename std::enable_if<!is_lattice<T1>::value &&                 \
+                                        !is_lattice_expr<T1>::value,          \
+                                    T1>::type * = nullptr,                    \
+            typename std::enable_if<is_lattice<T2>::value ||                  \
+                                        is_lattice_expr<T2>::value,           \
+                                    T2>::type * = nullptr>                    \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+      ->decltype(                                                             \
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+              std::make_pair(GRID_BINOP(name)(),                              \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
+  }

+#define GRID_DEF_BINOP(op, name) \
+  GRID_BINOP_LEFT(op, name);     \
+  GRID_BINOP_RIGHT(op, name);

-#define GRID_DEF_TRINOP(op, name)\
-template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,const T2&lhs,const T3 &rhs) \
-  -> decltype(LatticeTrinaryExpression<GRID_TRINOP(name),const T1&,const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(),\
-										   std::forward_as_tuple(pred,lhs,rhs)))) \
-{\
-  return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(), \
-										 std::forward_as_tuple(pred,lhs, rhs))); \
-}
+#define GRID_DEF_TRINOP(op, name)                                              \
+  template <typename T1, typename T2, typename T3>                             \
+  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \
+      ->decltype(                                                              \
+          LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \
+                                   const T3 &>(std::make_pair(                 \
+              GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \
+    return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
+                                    const T3 &>(std::make_pair(                \
+        GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \
+  }
 ////////////////////////
-//Operator definitions
+// Operator definitions
 ////////////////////////

-GRID_DEF_UNOP(operator -,UnarySub);
-GRID_DEF_UNOP(Not,UnaryNot);
-GRID_DEF_UNOP(operator !,UnaryNot);
-GRID_DEF_UNOP(adj,UnaryAdj);
-GRID_DEF_UNOP(conjugate,UnaryConj);
-GRID_DEF_UNOP(trace,UnaryTrace);
-GRID_DEF_UNOP(transpose,UnaryTranspose);
-GRID_DEF_UNOP(Ta,UnaryTa);
-GRID_DEF_UNOP(ProjectOnGroup,UnaryProjectOnGroup);
-GRID_DEF_UNOP(real,UnaryReal);
-GRID_DEF_UNOP(imag,UnaryImag);
-GRID_DEF_UNOP(toReal,UnaryToReal);
-GRID_DEF_UNOP(toComplex,UnaryToComplex);
-GRID_DEF_UNOP(abs  ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
-GRID_DEF_UNOP(sqrt ,UnarySqrt);
-GRID_DEF_UNOP(rsqrt,UnaryRsqrt);
-GRID_DEF_UNOP(sin  ,UnarySin);
-GRID_DEF_UNOP(cos  ,UnaryCos);
-GRID_DEF_UNOP(log  ,UnaryLog);
-GRID_DEF_UNOP(exp  ,UnaryExp);
+GRID_DEF_UNOP(operator-, UnarySub);
+GRID_DEF_UNOP(Not, UnaryNot);
+GRID_DEF_UNOP(operator!, UnaryNot);
+GRID_DEF_UNOP(adj, UnaryAdj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
+GRID_DEF_UNOP(trace, UnaryTrace);
+GRID_DEF_UNOP(transpose, UnaryTranspose);
+GRID_DEF_UNOP(Ta, UnaryTa);
+GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(real, UnaryReal);
+GRID_DEF_UNOP(imag, UnaryImag);
+GRID_DEF_UNOP(toReal, UnaryToReal);
+GRID_DEF_UNOP(toComplex, UnaryToComplex);
+GRID_DEF_UNOP(timesI, UnaryTimesI);
+GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
+GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
+                               // abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(sqrt, UnarySqrt);
+GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
+GRID_DEF_UNOP(sin, UnarySin);
+GRID_DEF_UNOP(cos, UnaryCos);
+GRID_DEF_UNOP(asin, UnaryAsin);
+GRID_DEF_UNOP(acos, UnaryAcos);
+GRID_DEF_UNOP(log, UnaryLog);
+GRID_DEF_UNOP(exp, UnaryExp);

-GRID_DEF_BINOP(operator+,BinaryAdd);
-GRID_DEF_BINOP(operator-,BinarySub);
-GRID_DEF_BINOP(operator*,BinaryMul);
+GRID_DEF_BINOP(operator+, BinaryAdd);
+GRID_DEF_BINOP(operator-, BinarySub);
+GRID_DEF_BINOP(operator*, BinaryMul);

-GRID_DEF_BINOP(operator&,BinaryAnd);
-GRID_DEF_BINOP(operator|,BinaryOr);
-GRID_DEF_BINOP(operator&&,BinaryAndAnd);
-GRID_DEF_BINOP(operator||,BinaryOrOr);
+GRID_DEF_BINOP(operator&, BinaryAnd);
+GRID_DEF_BINOP(operator|, BinaryOr);
+GRID_DEF_BINOP(operator&&, BinaryAndAnd);
+GRID_DEF_BINOP(operator||, BinaryOrOr);

-GRID_DEF_TRINOP(where,TrinaryWhere);
+GRID_DEF_TRINOP(where, TrinaryWhere);

 /////////////////////////////////////////////////////////////
 // Closure convenience to force expression to evaluate
 /////////////////////////////////////////////////////////////
-template<class Op,class T1>
-  auto closure(const LatticeUnaryExpression<Op,T1> & expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))>
-{
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr);
+template <class Op, class T1>
+auto closure(const LatticeUnaryExpression<Op, T1> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
+      expr);
  return ret;
 }
-template<class Op,class T1, class T2>
-  auto closure(const LatticeBinaryExpression<Op,T1,T2> & expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				      eval(0,std::get<1>(expr.second))))>
-{
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				   eval(0,std::get<1>(expr.second))))> ret(expr);
+template <class Op, class T1, class T2>
+auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                        eval(0, std::get<1>(expr.second))))> {
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second))))>
+      ret(expr);
  return ret;
 }
-template<class Op,class T1, class T2, class T3>
-  auto closure(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				      eval(0,std::get<1>(expr.second)),
-				      eval(0,std::get<2>(expr.second))))>
-{
-  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-				   eval(0,std::get<1>(expr.second)),
-				   eval(0,std::get<2>(expr.second))))> ret(expr);
+template <class Op, class T1, class T2, class T3>
+auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                        eval(0, std::get<1>(expr.second)),
+                                        eval(0, std::get<2>(expr.second))))> {
+  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second)),
+                                   eval(0, std::get<2>(expr.second))))>
+      ret(expr);
  return ret;
 }

@ -382,12 +431,11 @@ template<class Op,class T1, class T2, class T3>
 #undef GRID_DEF_UNOP
 #undef GRID_DEF_BINOP
 #undef GRID_DEF_TRINOP
-
 }

 #if 0
 using namespace Grid;
- 	      
+        
 int main(int argc,char **argv){
   
   Lattice<double> v1(16);
@ -397,7 +445,7 @@ using namespace Grid;
   BinaryAdd<double,double> tmp;
   LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &> 
     expr(std::make_pair(tmp,
-	  std::forward_as_tuple(v1,v2)));
+    std::forward_as_tuple(v1,v2)));
   tmp.func(eval(0,v1),eval(0,v2));

   auto var = v1+v2;
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@ -1,32 +1,33 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/lattice/Lattice_base.h
+Source file: ./lib/lattice/Lattice_base.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H

@ -101,6 +102,7 @@ public:
    int begin(void) { return 0;};
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; };
+    const vobj & operator[](int i) const { return _odata[i]; };

 public:
    typedef typename vobj::scalar_type scalar_type;
@ -255,6 +257,18 @@ PARALLEL_FOR_LOOP
        checkerboard=0;
    }

+    // Copy constructor: takes over r's grid pointer and checkerboard tag,
+    // then copies the site data element by element.
+    Lattice(const Lattice& r){
+        _grid        = r._grid;
+        checkerboard = r.checkerboard;
+        // _odata has to be sized before the loop below writes into it.
+        _odata.resize(_grid->oSites());
+PARALLEL_FOR_LOOP
+        for(int site=0;site<_grid->oSites();site++){
+            _odata[site]=r._odata[site];
+        }
+    }
+
+
+
    virtual ~Lattice(void) = default;
    
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
@ -267,7 +281,7 @@ PARALLEL_FOR_LOOP
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
-      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
+      
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
@ -324,27 +338,27 @@ PARALLEL_FOR_LOOP



-#include <lattice/Lattice_conformable.h>
+#include "Lattice_conformable.h"
 #define GRID_LATTICE_EXPRESSION_TEMPLATES
 #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
-#include <lattice/Lattice_ET.h>
+#include "Lattice_ET.h"
 #else 
-#include <lattice/Lattice_overload.h>
+#include "Lattice_overload.h"
 #endif
-#include <lattice/Lattice_arith.h>
-#include <lattice/Lattice_trace.h>
-#include <lattice/Lattice_transpose.h>
-#include <lattice/Lattice_local.h>
-#include <lattice/Lattice_reduction.h>
-#include <lattice/Lattice_peekpoke.h>
-#include <lattice/Lattice_reality.h>
-#include <lattice/Lattice_comparison_utils.h>
-#include <lattice/Lattice_comparison.h>
-#include <lattice/Lattice_coordinate.h>
-#include <lattice/Lattice_where.h>
-#include <lattice/Lattice_rng.h>
-#include <lattice/Lattice_unary.h>
-#include <lattice/Lattice_transfer.h>
+#include "Lattice_arith.h"
+#include "Lattice_trace.h"
+#include "Lattice_transpose.h"
+#include "Lattice_local.h"
+#include "Lattice_reduction.h"
+#include "Lattice_peekpoke.h"
+#include "Lattice_reality.h"
+#include "Lattice_comparison_utils.h"
+#include "Lattice_comparison.h"
+#include "Lattice_coordinate.h"
+#include "Lattice_where.h"
+#include "Lattice_rng.h"
+#include "Lattice_unary.h"
+#include "Lattice_transfer.h"


 #endif
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@ -40,7 +40,7 @@ namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
    ComplexD nrm = innerProduct(arg,arg);
-    return real(nrm); 
+    return std::real(nrm); 
  }

    template<class vobj>
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@ -386,7 +386,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
  }

  // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@ -420,15 +420,15 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
  assert(hg->_processors[orthog]==1);

  int dl; dl = 0;
-  for(int d=0;d<nh;d++){
-    if ( d != orthog) {
-      assert(lg->_processors[dl]  == hg->_processors[d]);
-      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
-      dl++;
+    for(int d=0;d<nh;d++){
+      if ( d != orthog) {
+	assert(lg->_processors[dl]  == hg->_processors[d]);
+	assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+	dl++;
    }
  }
  // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@ -482,6 +482,96 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)

 }

+// Copy a SIMD-vectorized lattice into an array of scalar objects, indexed
+// lexicographically over the local sites of the lattice's grid.
+//   out : resized to in._grid->lSites(); every entry is written through a
+//         pointer handed to extract1
+//   in  : source lattice, read one outer site at a time
+// Participates in overload resolution only when vobj is SIMD-vectorized and
+// sobj is not.
+template<typename vobj, typename sobj>
+typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
+  typedef typename vobj::vector_type vtype;
+
+  GridBase* grid  = in._grid;
+  const int nd    = grid->Nd();
+  const int nsimd = vtype::Nsimd();
+
+  out.resize(grid->lSites());
+
+  // Inner coordinate of each SIMD lane, computed once before the site loop.
+  std::vector<std::vector<int> > lane_icoor(nsimd, std::vector<int>(nd));
+  for(int lane=0; lane < nsimd; lane++){
+    grid->iCoorFromIindex(lane_icoor[lane], lane);
+  }
+
+PARALLEL_FOR_LOOP
+  for(int oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
+    // Scratch storage is declared inside the loop body, one set per iteration.
+    std::vector<sobj*> lane_dest(nsimd);
+    std::vector<int>   ocoor(nd);
+    std::vector<int>   lcoor(nd);
+
+    grid->oCoorFromOindex(ocoor, oidx);
+
+    for(int lane=0; lane < nsimd; lane++){
+      // Local coordinate from the outer coordinate plus the lane's inner
+      // coordinate scaled by _rdimensions.
+      for(int mu=0;mu<nd;mu++){
+        lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*lane_icoor[lane][mu];
+      }
+
+      int lex;
+      Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions);
+      lane_dest[lane] = &out[lex];
+    }
+
+    // Unpack this outer site into the per-lane destinations.
+    const vobj & packed = in._odata[oidx];
+    extract1(packed, lane_dest, 0);
+  }
+}
+
+//Convert a Lattice from one precision to another
+//   out : destination lattice; receives in's checkerboard tag and is filled
+//         outer site by outer site via merge()
+//   in  : source lattice; first unpacked by unvectorizeToLexOrdArray into a
+//         lexicographically ordered array of the output scalar object type
+// Both grids must have the same number of dimensions and the same local
+// extent in every dimension (asserted below).
+template<class VobjOut, class VobjIn>
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
+  assert(out._grid->Nd() == in._grid->Nd());
+  out.checkerboard = in.checkerboard;
+  GridBase *in_grid=in._grid;
+  GridBase *out_grid = out._grid;
+
+  typedef typename VobjOut::scalar_object SobjOut;
+  typedef typename VobjIn::scalar_object SobjIn;
+
+  int ndim = out._grid->Nd();
+  int out_nsimd = out_grid->Nsimd();
+
+  // The staging array is filled using in_grid's local dimensions but read
+  // back using out_grid's. If they differ the lexicographic indices no longer
+  // refer to the same sites and may run past the end of the array.
+  for(int mu=0;mu<ndim;mu++){
+    assert(out_grid->_ldimensions[mu] == in_grid->_ldimensions[mu]);
+  }
+    
+  // Inner coordinate of each output SIMD lane, computed once.
+  std::vector<std::vector<int> > out_icoor(out_nsimd);
+      
+  for(int lane=0; lane < out_nsimd; lane++){
+    out_icoor[lane].resize(ndim);
+    out_grid->iCoorFromIindex(out_icoor[lane], lane);
+  }
+        
+  // Scalar copy of the input in lexicographic order, already held as SobjOut.
+  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
+  unvectorizeToLexOrdArray(in_slex_conv, in);
+    
+  PARALLEL_FOR_LOOP
+  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
+    std::vector<int> out_ocoor(ndim);
+    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
+
+    std::vector<SobjOut*> ptrs(out_nsimd);      
+
+    std::vector<int> lcoor(out_grid->Nd());
+      
+    // Collect, for each lane, a pointer to the scalar site it should hold.
+    for(int lane=0; lane < out_nsimd; lane++){
+      for(int mu=0;mu<ndim;mu++)
+	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
+	
+      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
+      ptrs[lane] = &in_slex_conv[llex];
+    }
+    merge(out._odata[out_oidx], ptrs, 0);
+  }
+}
+
+
+  
+
+ 
 }
 #endif
--- a/lib/pugixml/.dirstamp
+++ b/lib/pugixml/.dirstamp
--- a/lib/pugixml/pugixml.h
+++ b/lib/pugixml/pugixml.h
@ -17,7 +17,7 @@
 #endif

 // Include user configuration file (this can define various configuration macros)
-#include <pugixml/pugiconfig.hpp>
+#include "pugiconfig.hpp"

 #ifndef HEADER_PUGIXML_HPP
 #define HEADER_PUGIXML_HPP
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@ -60,6 +60,12 @@ namespace QCD {
    static const int SpinIndex   = 1;
    static const int LorentzIndex= 0;

+    // Also should make these a named enum type
+    static const int DaggerNo=0;
+    static const int DaggerYes=1;
+    static const int InverseNo=0;
+    static const int InverseYes=1;
+
    // Useful traits is this a spin index
    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;

@ -484,16 +490,16 @@ namespace QCD {
 }   //namespace QCD
 } // Grid

-#include <qcd/utils/SpaceTimeGrid.h>
-#include <qcd/spin/Dirac.h>
-#include <qcd/spin/TwoSpinor.h>
-#include <qcd/utils/LinalgUtils.h>
-#include <qcd/utils/CovariantCshift.h>
-#include <qcd/utils/SUn.h>
-#include <qcd/action/Actions.h>
-#include <qcd/hmc/integrators/Integrator.h>
-#include <qcd/hmc/integrators/Integrator_algorithm.h>
-#include <qcd/hmc/HMC.h>
-
+#include <Grid/qcd/utils/SpaceTimeGrid.h>
+#include <Grid/qcd/spin/Dirac.h>
+#include <Grid/qcd/spin/TwoSpinor.h>
+#include <Grid/qcd/utils/LinalgUtils.h>
+#include <Grid/qcd/utils/CovariantCshift.h>
+#include <Grid/qcd/utils/SUn.h>
+#include <Grid/qcd/action/Actions.h>
+#include <Grid/qcd/hmc/integrators/Integrator.h>
+#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
+#include <Grid/qcd/hmc/HMC.h>
+#include <Grid/qcd/smearing/Smearing.h>

 #endif
--- a/lib/qcd/action/ActionBase.h
+++ b/lib/qcd/action/ActionBase.h
@ -35,6 +35,7 @@ template<class GaugeField>
 class Action { 

 public:
+  bool is_smeared = false;
  // Boundary conditions? // Heatbath?
  virtual void  refresh(const GaugeField &U, GridParallelRNG& pRNG) = 0;// refresh pseudofermions
  virtual RealD S    (const GaugeField &U)                        = 0;  // evaluate the action
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@ -40,25 +40,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Abstract base interface
 ////////////////////////////////////////////
-#include <qcd/action/ActionBase.h>
-#include <qcd/action/ActionParams.h>
+#include <Grid/qcd/action/ActionBase.h>
+#include <Grid/qcd/action/ActionParams.h>

 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
-#include <qcd/action/gauge/GaugeImpl.h>
-#include <qcd/utils/WilsonLoops.h>
+#include <Grid/qcd/action/gauge/GaugeImpl.h>
+#include <Grid/qcd/utils/WilsonLoops.h>

-#include <qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
-#include <qcd/action/fermion/FermionOperatorImpl.h>
-#include <qcd/action/fermion/FermionOperator.h>
-#include <qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
+#include <Grid/qcd/action/fermion/FermionOperator.h>
+#include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions

 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
-#include <qcd/action/gauge/WilsonGaugeAction.h>
-#include <qcd/action/gauge/PlaqPlusRectangleAction.h>
+#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
+#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>

 namespace Grid {
 namespace QCD {
@ -107,41 +107,50 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 // for EVERY .cc file. This define centralises the list and restores global push of impl cases
 ////////////////////////////////////////////////////////////////////////////////////////////////////

-#define FermOpTemplateInstantiate(A) \
+
+#define FermOp4dVecTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		

+#define FermOp5dVecTemplateInstantiate(A) \
+  template class A<DomainWallVec5dImplF>;	\
+  template class A<DomainWallVec5dImplD>;	
+
+#define FermOpTemplateInstantiate(A) \
+ FermOp4dVecTemplateInstantiate(A) \
+ FermOp5dVecTemplateInstantiate(A) 
+
 #define GparityFermOpTemplateInstantiate(A) 

 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////

-#include <qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
-#include <qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+#include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types

-//#include <qcd/action/fermion/CloverFermion.h>
+//#include <Grid/qcd/action/fermion/CloverFermion.h>

-#include <qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
-#include <qcd/action/fermion/DomainWallFermion.h>
-#include <qcd/action/fermion/DomainWallFermion.h>
-#include <qcd/action/fermion/MobiusFermion.h>
-#include <qcd/action/fermion/ScaledShamirFermion.h>
-#include <qcd/action/fermion/MobiusZolotarevFermion.h>
-#include <qcd/action/fermion/ShamirZolotarevFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <Grid/qcd/action/fermion/MobiusFermion.h>
+#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
+#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>

-#include <qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
-#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>

-#include <qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
-#include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
-#include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
+#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>

 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
@ -222,21 +231,21 @@ typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
-#include <qcd/action/fermion/g5HermitianLinop.h>
+#include <Grid/qcd/action/fermion/g5HermitianLinop.h>

 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
-#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
+#include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>

-#include <qcd/action/pseudofermion/TwoFlavour.h>
-#include <qcd/action/pseudofermion/TwoFlavourRatio.h>
-#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
-#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
+#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>

-#include <qcd/action/pseudofermion/OneFlavourRational.h>
-#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
-#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
-#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>

 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+
 #include <Grid.h>
+
+
 namespace Grid {
 namespace QCD {

@ -45,486 +48,342 @@ namespace QCD {
 		   FourDimGrid,
 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
- {
- }
+ { }

- template<class Impl>
-  void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
-  {
-    // Assemble Din
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	Din = bs psi[s] + cs[s] psi[s+1}
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	//      Din+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
-	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
-      }
-    }
+// Two-argument M5D: builds constant coefficient vectors (diagonal 1,
+// off-diagonals -1, with the two wrap-around entries set to mass) and
+// forwards to the six-argument M5D overload, passing chi as both the second
+// input and the output.
+template<class Impl>
+void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
+{
+  const int nslices = this->Ls;
+
+  std::vector<RealD> diag (nslices, 1.0);
+  std::vector<RealD> upper(nslices,-1.0);
+  std::vector<RealD> lower(nslices,-1.0);
+
+  // Wrap-around entries carry the mass instead of -1.
+  upper[nslices-1] = mass;
+  lower[0]         = mass;
+
+  M5D(psi,chi,chi,lower,diag,upper);
+}
+// Builds the coefficient vectors for the six-argument M5D from bs (diagonal)
+// and cs (both off-diagonals), scales the two wrap-around entries by -mass,
+// and writes the result of M5D(psi,psi,...) into Din.
+template<class Impl>
+void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
+{
+  const int last = this->Ls-1;
+
+  std::vector<RealD> diag = bs;
+  std::vector<RealD> upper= cs;
+  std::vector<RealD> lower= cs;
+
+  // Wrap-around entries: last upper and first lower pick up -mass.
+  upper[last] = -mass*upper[last];
+  lower[0]    = -mass*lower[0];
+
+  M5D(psi,psi,Din,lower,diag,upper);
+}
+template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = beo;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);
+  for(int i=0;i<Ls;i++) {
+    upper[i]=-ceo[i];
+    lower[i]=-ceo[i];
  }
- template<class Impl>
-  void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
-  {
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
-	axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);
-	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
-      } else {
-	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
-	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
-      }
-    }
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,chi,lower,diag,upper);
+}
+template<class Impl>
+void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = bee;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);
+  for(int i=0;i<Ls;i++) {
+    upper[i]=-cee[i];
+    lower[i]=-cee[i];
  }
+  upper[Ls-1]=-mass*upper[Ls-1];
+  lower[0]   =-mass*lower[0];
+  M5D(psi,psi,chi,lower,diag,upper);
+}

-  // override multiply
- template<class Impl>
-  RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag = bee;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);

-    FermionField Din(psi._grid);
-
-    // Assemble Din
-    /*
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	Din = bs psi[s] + cs[s] psi[s+1}
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	//      Din+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
-	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
-      }
-    }
-    */
-    Meooe5D(psi,Din);
-
-    this->DW(Din,chi,DaggerNo);
-    // ((b D_W + D_w hop terms +1) on s-diag
-    axpby(chi,1.0,1.0,chi,psi); 
-
-    // Call Mooee??
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ){
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,Ls-1);
-      } else if ( s==(Ls-1)) {
-	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,0);
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
-      }
-    }
-    return norm2(chi);
-  }
-
- template<class Impl>
-  RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
-  {
-    // Under adjoint
-    //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
-    //D2- P+     D2+            P-D1-^dag D2+dag
-
-    FermionField Din(psi._grid);
-    // Apply Dw
-    this->DW(psi,Din,DaggerYes); 
-
-    MeooeDag5D(Din,chi);
-
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-
-      // Collect the terms in DW
-      //	Chi = bs Din[s] + cs[s] Din[s+1}
-      //    Chi+= -mass*cs[s] psi[s+1}
-      /*
-      if ( s==0 ) {
-	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus (chi,bs[s],Din,-mass*cs[0],Din,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
-      } else {
-	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
-      }
-      */
-
-      // FIXME just call MooeeDag??
-
-      // Collect the terms indept of DW
-      if ( s==0 ){
-	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,Ls-1);
-      } else if ( s==(Ls-1)) {
-	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,1.0,chi,-1.0,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
-      }
-    }
-    // ((b D_W + D_w hop terms +1) on s-diag
-    axpby (chi,1.0,1.0,chi,psi); 
-    return norm2(chi);
-  }
-
-  // half checkerboard operations
- template<class Impl>
-  void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-
-    FermionField tmp(psi._grid);
+  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
-    Meooe5D(psi,tmp); 
-#if 0
-    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	tmp = bs psi[s] + cs[s] psi[s+1}
-	//      tmp+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
-	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
-	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
-	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      }
-    }
-    std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
-#endif
-
-    // Apply 4d dslash
-    if ( psi.checkerboard == Odd ) {
-      this->DhopEO(tmp,chi,DaggerNo);
+    if ( s==0 ) {
+      upper[s] = -cee[s+1] ;
+      lower[s] = mass*cee[Ls-1];
+    } else if ( s==(Ls-1)) { 
+      upper[s] = mass*cee[0];
+      lower[s] = -cee[s-1];
    } else {
-      this->DhopOE(tmp,chi,DaggerNo);
+      upper[s]=-cee[s+1];
+      lower[s]=-cee[s-1];
    }
  }

-  template<class Impl>
-  void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
-  {
-    FermionField tmp(psi._grid);
-    // Apply 4d dslash
-    if ( psi.checkerboard == Odd ) {
-      this->DhopEO(psi,tmp,DaggerYes);
-    } else {
-      this->DhopOE(psi,tmp,DaggerYes);
-    }
+  M5Ddag(psi,psi,chi,lower,diag,upper);
+}

-    MeooeDag5D(tmp,chi); 
-#if 0
-    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
-    // Assemble the 5d matrix
-    int Ls=this->Ls;
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pplus(chi,beo[s],tmp,   -ceo[s+1]  ,tmp,s,s+1);
-	axpby_ssp_pminus(chi,   1.0,chi,mass*ceo[Ls-1],tmp,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus(chi,beo[s],tmp,mass*ceo[0],tmp,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-ceo[s-1],tmp,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,beo[s],tmp,-ceo[s+1],tmp,s,s+1);
-	axpby_ssp_pminus(chi,1.0   ,chi,-ceo[s-1],tmp,s,s-1);
-      }
-    }
-    std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
-#endif
+// Two-argument M5Ddag: builds constant coefficient vectors (diagonal 1,
+// off-diagonals -1, with the two wrap-around entries equal to mass) and
+// forwards to the six-argument M5Ddag overload, passing chi as both the
+// second input and the output.
+template<class Impl>
+void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
+{
+  const int nslices = this->Ls;
+
+  std::vector<RealD> diag (nslices, 1.0);
+  std::vector<RealD> upper(nslices,-1.0);
+  std::vector<RealD> lower(nslices,-1.0);
+
+  // -mass * (-1.0) is exactly mass, so the wrap-around entries are set directly.
+  upper[nslices-1] = mass;
+  lower[0]         = mass;
+
+  M5Ddag(psi,chi,chi,lower,diag,upper);
+}

+// Fifth-dimension part of the daggered Meooe operator: builds the diagonal
+// (bs) and off-diagonal (cs) coefficient vectors and forwards to the
+// six-argument M5Ddag overload, writing the result into Din.
+//
+// In the adjoint each off-diagonal coefficient comes from the neighbouring
+// slice: upper[s] uses cs[s+1] and lower[s] uses cs[s-1]. The wrap-around
+// entries use -mass*cs[0] (at s=Ls-1) and -mass*cs[Ls-1] (at s=0). This is
+// the same index pattern MooeeDag uses for cee; using cs[s] unshifted is
+// only correct when cs is the same for every s.
+template<class Impl>
+void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
+{
+  int Ls=this->Ls;
+  std::vector<RealD> diag =bs;
+  std::vector<RealD> upper(Ls);
+  std::vector<RealD> lower(Ls);
+
+  for (int s=0;s<Ls;s++){
+    // The conditional form keeps every cs index in range, including Ls==1.
+    upper[s] = ( s==(Ls-1) ) ? -mass*cs[0]    : cs[s+1];
+    lower[s] = ( s==0      ) ? -mass*cs[Ls-1] : cs[s-1];
+  }
+
+  M5Ddag(psi,psi,Din,lower,diag,upper);
+}
+
+// Applies the full operator to psi and stores the result in chi.
+// Steps, in order: Meooe5D(psi) -> Din, DW(Din) -> chi, chi += psi,
+// then the two-argument M5D(psi,chi) updates chi in place.
+// Returns norm2(chi).
+template<class Impl>
+RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+{
+  FermionField Din(psi._grid);
+  
+  // Assemble Din
+  Meooe5D(psi,Din);
+  
+  this->DW(Din,chi,DaggerNo);
+  // ((b D_W + D_w hop terms +1) on s-diag
+  axpby(chi,1.0,1.0,chi,psi); 
+  
+  M5D(psi,chi);
+  return(norm2(chi));
+}
+
+// Applies the adjoint operator to psi and stores the result in chi.
+// Steps, in order: DW^dag(psi) -> scratch, MeooeDag5D(scratch) -> chi,
+// the two-argument M5Ddag(psi,chi) updates chi in place, then chi += psi.
+// Returns norm2(chi).
+template<class Impl>
+RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
+{
+  // Under adjoint
+  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
+  //D2- P+     D2+            P-D1-^dag D2+dag
+
+  // Scratch field holding DW^dag applied to psi.
+  FermionField dw_psi(psi._grid);
+  this->DW(psi,dw_psi,DaggerYes);
+
+  MeooeDag5D(dw_psi,chi);
+  M5Ddag(psi,chi);
+
+  // ((b D_W + D_w hop terms +1) on s-diag
+  axpby (chi,1.0,1.0,chi,psi);
+
+  return norm2(chi);
+}
+
+// half checkerboard operations
+template<class Impl>
+void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
+{
+  int Ls=this->Ls;
+  FermionField tmp(psi._grid);
+
+  Meooe5D(psi,tmp); 
+
+  if ( psi.checkerboard == Odd ) {
+    this->DhopEO(tmp,chi,DaggerNo);
+  } else {
+    this->DhopOE(tmp,chi,DaggerNo);
  }
+}

- template<class Impl>
-  void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    for (int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,mass*cee[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(chi,bee[s],psi,mass*cee[s],psi,s,0);
-	axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(chi,bee[s],psi,-cee[s],psi,s,s+1);
-	axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
-      }
-    }
+template<class Impl>
+void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+{
+  FermionField tmp(psi._grid);
+  // Apply 4d dslash
+  if ( psi.checkerboard == Odd ) {
+    this->DhopEO(psi,tmp,DaggerYes);
+  } else {
+    this->DhopOE(psi,tmp,DaggerYes);
  }
+  MeooeDag5D(tmp,chi); 
+}

- template<class Impl>
-  void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-    int Ls=this->Ls;
-    FermionField tmp(psi._grid);
-    // Assemble the 5d matrix
-    for(int s=0;s<Ls;s++){
-      if ( s==0 ) {
-	//	tmp = bs psi[s] + cs[s] psi[s+1}
-	//      tmp+= -mass*cs[s] psi[s+1}
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
-	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
-	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      } else {
-	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
-	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
-      }
-    }
-    // Apply 4d dslash fragment
-    this->DhopDir(tmp,chi,dir,disp);
+// Single-direction hop: applies Meo5D to psi into a scratch field, then
+// DhopDir on that scratch field for the given dir/disp, writing into chi.
+template<class Impl>
+void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+  FermionField meo_psi(psi._grid);
+
+  Meo5D(psi,meo_psi);
+
+  // Apply 4d dslash fragment
+  this->DhopDir(meo_psi,chi,dir,disp);
+}
+// force terms; five routines; default to Dhop on diagonal
+template<class Impl>
+void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
+  
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDeriv(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,Din);
+    this->DhopDeriv(mat,Din,V,dag);
  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    for (int s=0;s<Ls;s++){
-      // Assemble the 5d matrix
-      if ( s==0 ) {
-	axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1]  ,psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0,chi,mass*cee[Ls-1],psi,s,Ls-1);
-      } else if ( s==(Ls-1)) { 
-	axpby_ssp_pplus(chi,bee[s],psi,mass*cee[0],psi,s,0);
-	axpby_ssp_pminus(chi,1.0,chi,-cee[s-1],psi,s,s-1);
-      } else {
-	axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1],psi,s,s+1);
-	axpby_ssp_pminus(chi,1.0   ,chi,-cee[s-1],psi,s,s-1);
-      }
-    }
-  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    // Apply (L^{\prime})^{-1}
-    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
-    for (int s=1;s<Ls;s++){
-      axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
-    }
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
-    }
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-      axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
-    }	
-    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
-    
-    // Apply U^{-1}
-    for (int s=Ls-2;s>=0;s--){
-      axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls]
-    }
-  }
-
- template<class Impl>
-  void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
-  {
-    int Ls=this->Ls;
-    // Apply (U^{\prime})^{-dagger}
-    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
-    for (int s=1;s<Ls;s++){
-      axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
-    }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-      axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
-    }
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
-    }	
-    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
-    
-    // Apply L^{-dagger}
-    for (int s=Ls-2;s>=0;s--){
-      axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
-    }
-  }
-
-  // force terms; five routines; default to Dhop on diagonal
-  template<class Impl>
-  void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDeriv(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-      Meooe5D(U,Din);
-      this->DhopDeriv(mat,Din,V,dag);
-    }
-  };
- template<class Impl>
-  void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDerivOE(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+};
+template<class Impl>
+void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
+  
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDerivOE(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
      Meooe5D(U,Din);
      this->DhopDerivOE(mat,Din,V,dag);
-    }
-  };
- template<class Impl>
-  void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-  {
-    FermionField Din(V._grid);
-
-    if ( dag == DaggerNo ) {
-      //      U d/du [D_w D5] V = U d/du DW D5 V
-      Meooe5D(V,Din);
-      this->DhopDerivEO(mat,U,Din,dag);
-    } else {
-      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-      Meooe5D(U,Din);
-      this->DhopDerivEO(mat,Din,V,dag);
-    }
-  };
+  }
+};
+template<class Impl>
+void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+{
+  FermionField Din(V._grid);
  
-  // Tanh
- template<class Impl>
-  void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
-  {
-    SetCoefficientsZolotarev(1.0,zdata,b,c);
-
+  if ( dag == DaggerNo ) {
+    //      U d/du [D_w D5] V = U d/du DW D5 V
+    Meooe5D(V,Din);
+    this->DhopDerivEO(mat,U,Din,dag);
+  } else {
+    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    Meooe5D(U,Din);
+    this->DhopDerivEO(mat,Din,V,dag);
  }
-  //Zolo
- template<class Impl>
-  void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
-  {
-    int Ls=this->Ls;
+};
+  
+// Tanh
+// Convenience entry point: forwards zdata, b and c to SetCoefficientsZolotarev
+// with the zolo_hi scale fixed to 1.0.
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  SetCoefficientsZolotarev(1.0,zdata,b,c);
+}
+//Zolo
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  int Ls=this->Ls;

-    ///////////////////////////////////////////////////////////
-    // The Cayley coeffs (unprec)
-    ///////////////////////////////////////////////////////////
-    omega.resize(Ls);
-    bs.resize(Ls);
-    cs.resize(Ls);
-    as.resize(Ls);
+  ///////////////////////////////////////////////////////////
+  // The Cayley coeffs (unprec)
+  ///////////////////////////////////////////////////////////
+  omega.resize(Ls);
+  bs.resize(Ls);
+  cs.resize(Ls);
+  as.resize(Ls);
+  
+  // 
+  // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
+  //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
+  //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
+  //
+  //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
+  //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
+  //
+  // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
+  // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
+  //
+  // So 
+  //
+  // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
+  //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
+  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
+  //
+  // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
+  //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
+  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
+  // 
    
-    // 
-    // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
-    //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
-    //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
-    //
-    //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
-    //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
-    //
-    // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
-    // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
-    //
-    // So 
-    //
-    // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
-    //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
-    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-    //
-    // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
-    //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
-    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
-    // 
+  double bpc = b+c;
+  double bmc = b-c;
+  for(int i=0; i < Ls; i++){
+    as[i] = 1.0;
+    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    bs[i] = 0.5*(bpc/omega[i] + bmc);
+    cs[i] = 0.5*(bpc/omega[i] - bmc);
+  }
+  
+  ////////////////////////////////////////////////////////
+  // Constants for the preconditioned matrix Cayley form
+  ////////////////////////////////////////////////////////
+  bee.resize(Ls);
+  cee.resize(Ls);
+  beo.resize(Ls);
+  ceo.resize(Ls);
+  
+  for(int i=0;i<Ls;i++){
+    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
+    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
+    beo[i]=as[i]*bs[i];
+    ceo[i]=-as[i]*cs[i];
+  }
+  
+  aee.resize(Ls);
+  aeo.resize(Ls);
+  for(int i=0;i<Ls;i++){
+    aee[i]=cee[i];
+    aeo[i]=ceo[i];
+  }
+  
+  //////////////////////////////////////////
+  // LDU decomposition of eeoo
+  //////////////////////////////////////////
+  dee.resize(Ls);
+  lee.resize(Ls);
+  leem.resize(Ls);
+  uee.resize(Ls);
+  ueem.resize(Ls);
+  
+  for(int i=0;i<Ls;i++){
    
-    double bpc = b+c;
-    double bmc = b-c;
-    for(int i=0; i < Ls; i++){
-      as[i] = 1.0;
-      omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
-      bs[i] = 0.5*(bpc/omega[i] + bmc);
-      cs[i] = 0.5*(bpc/omega[i] - bmc);
-    }
-
-    ////////////////////////////////////////////////////////
-    // Constants for the preconditioned matrix Cayley form
-    ////////////////////////////////////////////////////////
-    bee.resize(Ls);
-    cee.resize(Ls);
-    beo.resize(Ls);
-    ceo.resize(Ls);
+    dee[i] = bee[i];
    
-    for(int i=0;i<Ls;i++){
-      bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
-      cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
-      beo[i]=as[i]*bs[i];
-      ceo[i]=-as[i]*cs[i];
-    }
-
-    aee.resize(Ls);
-    aeo.resize(Ls);
-    for(int i=0;i<Ls;i++){
-      aee[i]=cee[i];
-      aeo[i]=ceo[i];
-    }
-
-    //////////////////////////////////////////
-    // LDU decomposition of eeoo
-    //////////////////////////////////////////
-    dee.resize(Ls);
-    lee.resize(Ls);
-    leem.resize(Ls);
-    uee.resize(Ls);
-    ueem.resize(Ls);
-    
-    for(int i=0;i<Ls;i++){
+    if ( i < Ls-1 ) {
      
-      dee[i] = bee[i];
+      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
      
-      if ( i < Ls-1 ) {
-	
-	lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-	    
-	leem[i]=mass*cee[Ls-1]/bee[0];
-	for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
-	
-	uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-	
-	ueem[i]=mass;
-	for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
-	ueem[i]*= aee[0]/bee[0];
-	    
-      } else { 
-	lee[i] =0.0;
-	leem[i]=0.0;
-	uee[i] =0.0;
-	ueem[i]=0.0;
-      }
-    }
-	
-    { 
-      double delta_d=mass*cee[Ls-1];
-      for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
-      dee[Ls-1] += delta_d;
+      leem[i]=mass*cee[Ls-1]/bee[0];
+      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
+      
+      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
+      
+      ueem[i]=mass;
+      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
+      ueem[i]*= aee[0]/bee[0];
+      
+    } else { 
+      lee[i] =0.0;
+      leem[i]=0.0;
+      uee[i] =0.0;
+      ueem[i]=0.0;
    }
  }
+	
+  { 
+    double delta_d=mass*cee[Ls-1];
+    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
+    dee[Ls-1] += delta_d;
+  }  
+}
+
+

  FermOpTemplateInstantiate(CayleyFermion5D);
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@ -51,6 +51,29 @@ namespace Grid {
      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+      virtual void   Meo5D (const FermionField &psi, FermionField &chi);
+
+      virtual void   M5D   (const FermionField &psi, FermionField &chi);
+      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
+
+      /////////////////////////////////////////////////////
+      // Instantiate different versions depending on Impl
+      /////////////////////////////////////////////////////
+      // Fifth-dimension coupling kernel with caller-supplied coefficients.
+      // In the implementations accompanying this declaration, for every s in [0,Ls):
+      //   chi[s] = diag[s]*phi[s] + upper[s]*P_- psi[s+1] + lower[s]*P_+ psi[s-1]
+      // where slice Ls-1 takes slice 0 as its "s+1" and slice 0 takes slice Ls-1
+      // as its "s-1". The three vectors are indexed by s.
+      void M5D(const FermionField &psi,
+	       const FermionField &phi, 
+	       FermionField &chi,
+	       std::vector<RealD> &lower,
+	       std::vector<RealD> &diag,
+	       std::vector<RealD> &upper);
+
+      // Same kernel as the six-argument M5D above with the two chiral projectors
+      // exchanged: P_+ acts on the s+1 neighbour and P_- on the s-1 neighbour.
+      void M5Ddag(const FermionField &psi,
+		  const FermionField &phi, 
+		  FermionField &chi,
+		  std::vector<RealD> &lower,
+		  std::vector<RealD> &diag,
+		  std::vector<RealD> &upper);
+      // Dense-matrix implementation of the Mooee family; dag and inv are tested as
+      // booleans and select the adjoint and/or the inverse of the Ls x Ls matrices.
+      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
+
+
      virtual void   Instantiatable(void)=0;

      // force terms; five routines; default to Dhop on diagonal
@ -94,6 +117,8 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

+
+
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
@ -101,5 +126,15 @@ namespace Grid {

  }
 }
+// Explicitly instantiates, for implementation type A, the four member functions
+// that the alternative CayleyFermion5D*.cc files each define: the six-argument
+// M5D and M5Ddag, plus MooeeInv and MooeeInvDag.
+// NOTE(review): the expansion names CayleyFermion5D unqualified, so it presumably
+// has to be expanded inside namespace Grid::QCD, as the .cc files do -- confirm.
+#define INSTANTIATE_DPERP(A)\
+template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
+					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
+					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
+template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
+
+// Choose which implementation file emits those instantiations. Each variant's
+// INSTANTIATE block is guarded by one of these symbols: the cache variant is
+// switched on here and the linalg (ssp) variant is switched off.
+#define CAYLEY_DPERP_CACHE
+#undef  CAYLEY_DPERP_LINALG

 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@ -0,0 +1,209 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+  // Pminus forwards
+  // Pplus  backwards..
+// Fifth-dimension coupling kernel, site-block-outermost variant.
+//   psi               : field whose neighbouring s-slices are chirally projected
+//   phi               : field supplying the diagonal term (same checkerboard as psi)
+//   chi               : output; its checkerboard is set to that of psi
+//   lower, diag, upper: per-slice coefficients, indexed by s
+// For each block of Ls consecutive outer sites starting at ss:
+//   chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*P_- psi[ss+s_up] + lower[s]*P_+ psi[ss+s_dn]
+template<class Impl>
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+                                const FermionField &phi,
+                                FermionField &chi,
+                                std::vector<RealD> &lower,
+                                std::vector<RealD> &diag,
+                                std::vector<RealD> &upper)
+{
+  int Ls = this->Ls;
+  GridBase *grid = psi._grid;
+
+  assert(phi.checkerboard == psi.checkerboard);
+  chi.checkerboard = psi.checkerboard;
+
+PARALLEL_FOR_LOOP
+  for(int ss=0; ss<grid->oSites(); ss+=Ls){ // one block of Ls slices per iteration
+    for(int s=0; s<Ls; s++){
+      // Neighbour slices. The s==0 case is decided first, so it wins over the
+      // s==Ls-1 case should both hold at once.
+      int s_up = ( s!=0 && s==(Ls-1) ) ? 0      : (s+1);
+      int s_dn = ( s==0 )              ? (Ls-1) : (s-1);
+
+      auto tmp = psi._odata[0];
+
+      // Diagonal term plus the P_- projected upper neighbour.
+      spProj5m(tmp,psi._odata[ss+s_up]);
+      chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+      // Accumulate the P_+ projected lower neighbour.
+      spProj5p(tmp,psi._odata[ss+s_dn]);
+      chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+    }
+  }
+}
+
+// Counterpart of the six-argument M5D with the chiral projectors exchanged.
+//   psi               : field whose neighbouring s-slices are chirally projected
+//   phi               : field supplying the diagonal term (same checkerboard as psi)
+//   chi               : output; its checkerboard is set to that of psi
+//   lower, diag, upper: per-slice coefficients, indexed by s
+// For each block of Ls consecutive outer sites starting at ss:
+//   chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*P_+ psi[ss+s_up] + lower[s]*P_- psi[ss+s_dn]
+template<class Impl>
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+                                   const FermionField &phi,
+                                   FermionField &chi,
+                                   std::vector<RealD> &lower,
+                                   std::vector<RealD> &diag,
+                                   std::vector<RealD> &upper)
+{
+  int Ls = this->Ls;
+  GridBase *grid = psi._grid;
+
+  assert(phi.checkerboard == psi.checkerboard);
+  chi.checkerboard = psi.checkerboard;
+
+PARALLEL_FOR_LOOP
+  for(int ss=0; ss<grid->oSites(); ss+=Ls){ // one block of Ls slices per iteration
+    auto tmp = psi._odata[0];
+    for(int s=0; s<Ls; s++){
+      // Neighbour slices. The s==0 case is decided first, so it wins over the
+      // s==Ls-1 case should both hold at once.
+      int s_up = ( s!=0 && s==(Ls-1) ) ? 0      : (s+1);
+      int s_dn = ( s==0 )              ? (Ls-1) : (s-1);
+
+      // Diagonal term plus the P_+ projected upper neighbour.
+      spProj5p(tmp,psi._odata[ss+s_up]);
+      chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
+
+      // Accumulate the P_- projected lower neighbour.
+      spProj5m(tmp,psi._odata[ss+s_dn]);
+      chi[ss+s]=chi[ss+s]+lower[s]*tmp;
+    }
+  }
+}
+
+// Applies the inverse of the checkerboard-diagonal block, one block of Ls slices
+// at a time, using the member coefficient tables lee, leem, dee, ueem and uee.
+//   psi : input field
+//   chi : output field; its checkerboard is set to that of psi
+// The four stages below update chi in place and each one reads values written
+// by the previous stage, so their order must not be changed.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  GridBase *grid=psi._grid;
+  int Ls=this->Ls;
+
+  chi.checkerboard=psi.checkerboard;
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+    // Scratch site object; it is assigned by a projection before every use.
+    auto tmp = psi._odata[0];
+
+    // Apply (L^{\prime})^{-1}
+    // Forward recurrence: slice s uses the already-updated slice s-1 of chi.
+    chi[ss]=psi[ss]; // chi[0]=psi[0]
+    for(int s=1;s<Ls;s++){
+                            spProj5p(tmp,chi[ss+s-1]);  
+      chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
+    }
+    // L_m^{-1} 
+    // Only the last slice is modified here; it gathers from slices 0..Ls-2.
+    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+                                   spProj5m(tmp,chi[ss+s]);    
+      chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
+    }
+    // U_m^{-1} D^{-1}
+    // Slices 0..Ls-2 are rescaled and corrected using the not-yet-rescaled last
+    // slice; the last slice itself is rescaled only after this loop.
+    for (int s=0;s<Ls-1;s++){
+      // Chi[s] + 1/d chi[s] 
+                                                spProj5p(tmp,chi[ss+Ls-1]); 
+      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
+    }	
+    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
+      
+    // Apply U^{-1}
+    // Backward recurrence: slice s uses the already-updated slice s+1 of chi.
+    for (int s=Ls-2;s>=0;s--){
+                            spProj5m(tmp,chi[ss+s+1]);  
+      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
+    }
+  }
+}
+
+// Applies the daggered inverse of the checkerboard-diagonal block, one block of
+// Ls slices at a time, using the member coefficient tables uee, ueem, dee, leem
+// and lee.
+//   psi : input field
+//   chi : output field; its checkerboard is set to that of psi
+// The stages mirror MooeeInv with the roles of the upper/lower tables and of the
+// two chiral projectors exchanged. They update chi in place and each stage reads
+// what the previous one wrote, so their order must not be changed.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  GridBase *grid=psi._grid;
+  int Ls=this->Ls;
+
+  // No checkerboard precondition exists here: chi simply inherits psi's
+  // checkerboard, exactly as in MooeeInv.
+  chi.checkerboard=psi.checkerboard;
+
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+
+    // Scratch site object; it is assigned by a projection before every use.
+    auto tmp = psi._odata[0];
+
+    // Apply (U^{\prime})^{-dagger}
+    // Forward recurrence: slice s uses the already-updated slice s-1 of chi.
+    chi[ss]=psi[ss];
+    for (int s=1;s<Ls;s++){
+                            spProj5m(tmp,chi[ss+s-1]);
+      chi[ss+s] = psi[ss+s]-uee[s-1]*tmp;
+    }
+    // U_m^{-\dagger} 
+    // Only the last slice is modified here; it gathers from slices 0..Ls-2.
+    for (int s=0;s<Ls-1;s++){
+                                   spProj5p(tmp,chi[ss+s]);
+      chi[ss+Ls-1] = chi[ss+Ls-1] - ueem[s]*tmp;
+    }
+
+    // L_m^{-\dagger} D^{-dagger}
+    // Slices 0..Ls-2 are rescaled and corrected using the not-yet-rescaled last
+    // slice; the last slice itself is rescaled only after this loop.
+    for (int s=0;s<Ls-1;s++){
+      spProj5m(tmp,chi[ss+Ls-1]);
+      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(leem[s]/dee[Ls-1])*tmp;
+    }	
+    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
+  
+    // Apply L^{-dagger}
+    // Backward recurrence: slice s uses the already-updated slice s+1 of chi.
+    for (int s=Ls-2;s>=0;s--){
+      spProj5p(tmp,chi[ss+s+1]);
+      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
+    }
+  }
+}
+
+// Emit the explicit instantiations of this file's M5D, M5Ddag, MooeeInv and
+// MooeeInvDag only when the cache variant has been selected.
+#ifdef CAYLEY_DPERP_CACHE
+  INSTANTIATE_DPERP(WilsonImplF);
+  INSTANTIATE_DPERP(WilsonImplD);
+  INSTANTIATE_DPERP(GparityWilsonImplF);
+  INSTANTIATE_DPERP(GparityWilsonImplD);
+#endif
+
+}}
--- a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
@ -0,0 +1,133 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5Ddense.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Eigen/Dense>
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+  /*
+   * Dense matrix versions of routines
+   */
+
+  /*
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
+}
+  
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
+}
+  */
+// Dense-matrix implementation of the Mooee family of operators.
+//   psi : input field
+//   chi : output field; its checkerboard is set to that of psi
+//   dag : when non-zero, the adjoint of each Ls x Ls matrix is used
+//   inv : when non-zero, the inverse of each Ls x Ls matrix is used
+// Two real Ls x Ls matrices are built from bee, cee and mass, one for each chiral
+// projection. They are optionally inverted and/or adjointed, then applied across
+// the fifth dimension independently at every site.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  int Ls=this->Ls;
+  int LLs = psi._grid->_rdimensions[0];
+  int vol = psi._grid->oSites()/LLs;
+  
+  chi.checkerboard=psi.checkerboard;
+  
+  // The s1+Ls*site indexing below relies on the leading reduced dimension of the
+  // grid being exactly Ls.
+  assert(Ls==LLs);
+  
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  
+  // Both matrices share the diagonal bee[s].
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  
+  // Pminus couples slice s to slice s+1 (super-diagonal).
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  
+  // Pplus couples slice s+1 to slice s (sub-diagonal).
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  // Corner entries: the wrap-around couplings between the first and last slice
+  // carry the mass factor.
+  Pplus (0,Ls-1) = mass*cee[0];
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+  
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+
+  // For the non-vectorised s-direction this is simple
+  
+  // This site loop carries no PARALLEL_FOR_LOOP marker.
+  for(auto site=0;site<vol;site++){
+    
+    SiteSpinor     SiteChi;
+    SiteHalfSpinor SitePplus;
+    SiteHalfSpinor SitePminus;
+    
+    for(int s1=0;s1<Ls;s1++){
+      SiteChi =zero;
+      for(int s2=0;s2<Ls;s2++){
+	int lex2 = s2+Ls*site;
+	
+	// Exactly-zero matrix entries are skipped, so a sparse (non-inverted)
+	// matrix costs only its non-zero couplings.
+	if ( PplusMat(s1,s2) != 0.0 ) {
+	  spProj5p(SitePplus,psi[lex2]);
+	  accumRecon5p(SiteChi,PplusMat (s1,s2)*SitePplus);
+	}
+	
+	if ( PminusMat(s1,s2) != 0.0 ) {
+	  spProj5m(SitePminus,psi[lex2]);
+	  accumRecon5m(SiteChi,PminusMat(s1,s2)*SitePminus);
+	}
+      }
+      // NOTE(review): the 0.5 presumably compensates the normalisation of the
+      // project/reconstruct pair -- confirm against their definitions.
+      chi[s1+Ls*site] = SiteChi*0.5;
+    }
+  }
+}
+
+// Explicit instantiations of MooeeInternal for the four implementation types;
+// unlike the DPERP routines these are emitted unconditionally.
+template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+}}
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@ -0,0 +1,149 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+
+  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
+
+  // Pminus forwards
+  // Pplus  backwards
+// Fifth-dimension coupling kernel built from whole-slice axpby_ssp_* calls.
+//   psi               : field whose neighbouring s-slices enter through the projectors
+//   phi               : field supplying the diagonal term
+//   chi               : output field
+//   lower, diag, upper: per-slice coefficients, indexed by s
+// Per slice s, the pminus call combines diag[s]*phi[s] with upper[s] times the
+// upper neighbour of psi, and the pplus call then adds lower[s] times the lower
+// neighbour of psi onto chi[s].
+template<class Impl>
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+                                const FermionField &phi,
+                                FermionField &chi,
+                                std::vector<RealD> &lower,
+                                std::vector<RealD> &diag,
+                                std::vector<RealD> &upper)
+{
+  int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){
+    // Neighbour slices. The s==0 case is decided first, so it wins over the
+    // s==Ls-1 case should both hold at once.
+    int s_up = ( s!=0 && s==(Ls-1) ) ? 0      : (s+1);
+    int s_dn = ( s==0 )              ? (Ls-1) : (s-1);
+
+    axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s_up);
+    axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,s_dn);
+  }
+}
+// Counterpart of the six-argument M5D with the pplus/pminus calls exchanged.
+//   psi               : field whose neighbouring s-slices enter through the projectors
+//   phi               : field supplying the diagonal term
+//   chi               : output field
+//   lower, diag, upper: per-slice coefficients, indexed by s
+// Per slice s, the pplus call combines diag[s]*phi[s] with upper[s] times the
+// upper neighbour of psi, and the pminus call then adds lower[s] times the lower
+// neighbour of psi onto chi[s].
+template<class Impl>
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+                                   const FermionField &phi,
+                                   FermionField &chi,
+                                   std::vector<RealD> &lower,
+                                   std::vector<RealD> &diag,
+                                   std::vector<RealD> &upper)
+{
+  int Ls = this->Ls;
+  for(int s=0; s<Ls; s++){
+    // Neighbour slices. The s==0 case is decided first, so it wins over the
+    // s==Ls-1 case should both hold at once.
+    int s_up = ( s!=0 && s==(Ls-1) ) ? 0      : (s+1);
+    int s_dn = ( s==0 )              ? (Ls-1) : (s-1);
+
+    axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s_up);
+    axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s_dn);
+  }
+}
+
+// Applies the inverse of the checkerboard-diagonal block using whole-slice
+// axpby_ssp_* calls and the member tables lee, leem, dee, ueem and uee.
+//   psi : input field
+//   chi : output field; its checkerboard is set to that of psi
+// NOTE(review): each axpby_ssp_* call presumably writes slice `s` of its first
+// argument from slice `s` of the third and a projected slice of the fifth
+// argument -- confirm against the helper definitions.
+// chi is updated in place and every stage reads the previous stage's output, so
+// the stage order must not be changed.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
+{
+  chi.checkerboard=psi.checkerboard;
+  int Ls=this->Ls;
+  // Apply (L^{\prime})^{-1}
+  axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
+  for (int s=1;s<Ls;s++){
+    axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
+  }
+  // L_m^{-1} 
+  // Only the last slice is written here; it gathers from slices 0..Ls-2.
+  for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
+    axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
+  }
+  // U_m^{-1} D^{-1}
+  // Slices 0..Ls-2 use the last slice before it is rescaled just below.
+  for (int s=0;s<Ls-1;s++){
+    // Chi[s] + 1/d chi[s] 
+    axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
+  }	
+  axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
+  
+  // Apply U^{-1}
+  // Runs from s=Ls-2 down to 0 so slice s+1 is final before slice s reads it.
+  for (int s=Ls-2;s>=0;s--){
+    axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls]
+  }
+}
+
+// Applies the daggered inverse of the checkerboard-diagonal block using
+// whole-slice axpby_ssp_* calls and the member tables uee, ueem, dee, leem, lee.
+//   psi : input field
+//   chi : output field; its checkerboard is set to that of psi
+// The stages mirror MooeeInv with the upper/lower tables and the pplus/pminus
+// helpers exchanged. chi is updated in place and every stage reads the previous
+// stage's output, so the stage order must not be changed.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  chi.checkerboard=psi.checkerboard;
+  int Ls=this->Ls;
+  // Apply (U^{\prime})^{-dagger}
+  axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
+  for (int s=1;s<Ls;s++){
+    axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
+  }
+  // U_m^{-\dagger} 
+  // Only the last slice is written here; it gathers from slices 0..Ls-2.
+  for (int s=0;s<Ls-1;s++){
+    axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
+  }
+  // L_m^{-\dagger} D^{-dagger}
+  // Slices 0..Ls-2 use the last slice before it is rescaled just below.
+  for (int s=0;s<Ls-1;s++){
+    axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
+  }	
+  axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
+  
+  // Apply L^{-dagger}
+  // Runs from s=Ls-2 down to 0 so slice s+1 is final before slice s reads it.
+  for (int s=Ls-2;s>=0;s--){
+    axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
+  }
+}
+
+
+// Emit the explicit instantiations of this file's M5D, M5Ddag, MooeeInv and
+// MooeeInvDag only when the linalg (ssp) variant has been selected. The macro
+// provided for this by CayleyFermion5D.h is INSTANTIATE_DPERP, the same one the
+// cache variant uses.
+#ifdef CAYLEY_DPERP_LINALG
+  INSTANTIATE_DPERP(WilsonImplF);
+  INSTANTIATE_DPERP(WilsonImplD);
+  INSTANTIATE_DPERP(GparityWilsonImplF);
+  INSTANTIATE_DPERP(GparityWilsonImplD);
+#endif
+
+}
+}
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@ -0,0 +1,305 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Eigen/Dense>
+#include <Grid.h>
+
+
+namespace Grid {
+namespace QCD {
+  /*
+   * Dense matrix versions of routines
+   */
+template<class Impl> // Dense-matrix MooeeInvDag: forwards to MooeeInternal with the dagger and inverse flags both set
+void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerYes,InverseYes); // psi is the input field, chi receives the result
+}
+  
+template<class Impl> // Dense-matrix MooeeInv: forwards to MooeeInternal with the inverse flag set and no dagger
+void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
+{
+  this->MooeeInternal(psi,chi,DaggerNo,InverseYes); // psi is the input field, chi receives the result
+}
+template<class Impl>  // M5D: chi[v] = diag*phi[v] + upper*fp + lower*fm, where fp/fm are built from the cyclic v+1 / v-1 neighbours of psi
+void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
+				const FermionField &phi, 
+				FermionField &chi,
+				std::vector<RealD> &lower,
+				std::vector<RealD> &diag,
+				std::vector<RealD> &upper)
+{
+  GridBase *grid=psi._grid;
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0]; // extent of dimension 0 in the reduced (vectorised) layout
+  int nsimd= Simd::Nsimd();
+
+  Vector<iSinglet<Simd> > u(LLs); // per-slice copies of upper/lower/diag, filled through the scalar pointers below
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd); // the s extent must be spread over exactly nsimd lanes per stored slice
+  assert(phi.checkerboard == psi.checkerboard);
+
+  chi.checkerboard=psi.checkerboard;
+
+  // just directly address via type pun: each iSinglet<Simd> is written as nsimd consecutive scalar_type values
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  for(int o=0;o<LLs;o++){ // outer
+  for(int i=0;i<nsimd;i++){ //inner
+    int s  = o+i*LLs; // index into the caller's coefficient vectors
+    int ss = o*nsimd+i; // scalar slot i of stored slice o
+    u_p[ss] = upper[s];
+    l_p[ss] = lower[s];
+    d_p[ss] = diag[s];
+  }}
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs: each iteration handles one run of LLs consecutive sites
+
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs; // cyclic +1 neighbour within the run
+      int vm=(v+LLs-1)%LLs; // cyclic -1 neighbour within the run
+
+      spProj5m(hp,psi[ss+vp]);
+      spProj5p(hm,psi[ss+vm]);
+      
+      if ( vp<=v ) rotate(hp,hp,1); // true only when v+1 wrapped (v==LLs-1); presumably shifts lanes so the neighbour lines up -- confirm rotate's direction
+      if ( vm>=v ) rotate(hm,hm,nsimd-1); // true only when v-1 wrapped (v==0); rotated by nsimd-1 instead of 1
+
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5m(fp,hp);
+      spRecon5p(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; // diagonal term plus upper-coefficient term
+      chi[ss+v] = chi[ss+v]     +l[v]*fm; // add lower-coefficient term
+
+    }
+  }
+}
+
+template<class Impl>  // M5Ddag: same structure as M5D with the 5p/5m projectors and reconstructions exchanged
+void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
+				   const FermionField &phi, 
+				   FermionField &chi,
+				   std::vector<RealD> &lower,
+				   std::vector<RealD> &diag,
+				   std::vector<RealD> &upper)
+{
+  GridBase *grid=psi._grid;
+  int Ls   = this->Ls;
+  int LLs  = grid->_rdimensions[0]; // extent of dimension 0 in the reduced (vectorised) layout
+  int nsimd= Simd::Nsimd();
+
+  Vector<iSinglet<Simd> > u(LLs); // per-slice copies of upper/lower/diag, filled through the scalar pointers below
+  Vector<iSinglet<Simd> > l(LLs);
+  Vector<iSinglet<Simd> > d(LLs);
+
+  assert(Ls/LLs==nsimd); // the s extent must be spread over exactly nsimd lanes per stored slice
+  assert(phi.checkerboard == psi.checkerboard);
+
+  chi.checkerboard=psi.checkerboard;
+
+  // just directly address via type pun: each iSinglet<Simd> is written as nsimd consecutive scalar_type values
+  typedef typename Simd::scalar_type scalar_type;
+  scalar_type * u_p = (scalar_type *)&u[0];
+  scalar_type * l_p = (scalar_type *)&l[0];
+  scalar_type * d_p = (scalar_type *)&d[0];
+
+  for(int o=0;o<LLs;o++){ // outer
+  for(int i=0;i<nsimd;i++){ //inner
+    int s  = o+i*LLs; // index into the caller's coefficient vectors
+    int ss = o*nsimd+i; // scalar slot i of stored slice o
+    u_p[ss] = upper[s];
+    l_p[ss] = lower[s];
+    d_p[ss] = diag[s];
+  }}
+
+PARALLEL_FOR_LOOP
+  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs: each iteration handles one run of LLs consecutive sites
+
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;
+
+    for(int v=0;v<LLs;v++){
+
+      int vp=(v+1)%LLs; // cyclic +1 neighbour within the run
+      int vm=(v+LLs-1)%LLs; // cyclic -1 neighbour within the run
+
+      spProj5p(hp,psi[ss+vp]);
+      spProj5m(hm,psi[ss+vm]);
+
+      if ( vp<=v ) rotate(hp,hp,1); // true only when v+1 wrapped (v==LLs-1); presumably shifts lanes so the neighbour lines up -- confirm rotate's direction
+      if ( vm>=v ) rotate(hm,hm,nsimd-1); // true only when v-1 wrapped (v==0); rotated by nsimd-1 instead of 1
+      
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5p(fp,hp);
+      spRecon5m(fm,hm);
+
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; // diagonal term plus upper-coefficient term
+      chi[ss+v] = chi[ss+v]     +l[v]*fm; // add lower-coefficient term
+
+    }
+  }
+}
+
+template<class Impl> // MooeeInternal: builds two dense Ls x Ls matrices from bee/cee/mass, optionally inverts and/or adjoints them, and applies them along s to the 5p / 5m projections of psi
+void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
+{
+  int Ls=this->Ls;
+  int LLs = psi._grid->_rdimensions[0]; // extent of dimension 0 in the reduced (vectorised) layout
+  int vol = psi._grid->oSites()/LLs; // number of runs of LLs consecutive sites
+
+  chi.checkerboard=psi.checkerboard;
+  
+  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls); // matrix applied to the spProj5p half of psi
+  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls); // matrix applied to the spProj5m half of psi
+  
+  for(int s=0;s<Ls;s++){ // both matrices share the diagonal bee[s]
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){ // Pminus: first super-diagonal
+    Pminus(s,s+1) = -cee[s];
+  }
+  
+  for(int s=0;s<Ls-1;s++){ // Pplus: first sub-diagonal
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0]; // corner entries carry the mass factor
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  
+  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXd PminusMat;
+  
+  if ( inv ) { // inv selects the dense inverse, otherwise the matrices are used as built
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  
+  if(dag){ // dag applies the adjoint after the optional inversion
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+  
+  typedef typename SiteHalfSpinor::scalar_type scalar_type;
+  const int Nsimd=Simd::Nsimd();
+  Vector<iSinglet<Simd> > Matp(Ls*LLs); // packed copy of PplusMat: entry [LLs*s2+s1] holds column s2, rows s1+l*LLs in slot l
+  Vector<iSinglet<Simd> > Matm(Ls*LLs); // packed copy of PminusMat, same layout
+
+  for(int s2=0;s2<Ls;s2++){
+  for(int s1=0;s1<LLs;s1++){
+    int istride = LLs;
+    int ostride = 1;
+      Simd Vp;
+      Simd Vm;
+      scalar_type *sp = (scalar_type *)&Vp; // type pun: write the Simd word as Nsimd consecutive scalars
+      scalar_type *sm = (scalar_type *)&Vm;
+      for(int l=0;l<Nsimd;l++){
+	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
+	sm[l] = PminusMat(l*istride+s1*ostride,s2);
+      }
+      Matp[LLs*s2+s1] = Vp;
+      Matm[LLs*s2+s1] = Vm;
+    }
+  }
+  
+  // Scratch buffers are constructed inside the loop body so every iteration owns its own; the alloca (stack) variant is kept commented out below
+PARALLEL_FOR_LOOP
+  for(auto site=0;site<vol;site++){
+    
+    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
+    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
+    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
+
+    Vector<SiteHalfSpinor> SitePplus(LLs);
+    Vector<SiteHalfSpinor> SitePminus(LLs);
+    Vector<SiteHalfSpinor> SiteChiP(LLs);
+    Vector<SiteHalfSpinor> SiteChiM(LLs);
+    Vector<SiteSpinor>     SiteChi(LLs);
+
+    SiteHalfSpinor BcastP;
+    SiteHalfSpinor BcastM;
+
+    for(int s=0;s<LLs;s++){ // project the LLs input slices of this run and clear the accumulators
+      int lex = s+LLs*site;
+      spProj5p(SitePplus[s] ,psi[lex]);
+      spProj5m(SitePminus[s],psi[lex]);
+      SiteChiP[s]=zero;
+      SiteChiM[s]=zero;
+    }
+      
+    int s=0; // running matrix-column index; advances as s = l*LLs + s2 through the two loops below
+    for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
+	vbroadcast(BcastP,SitePplus [s2],l); // presumably replicates lane l of the slice across all lanes -- confirm against vbroadcast
+	vbroadcast(BcastM,SitePminus[s2],l);
+	for(int s1=0;s1<LLs;s1++){ // loop over the output slices being accumulated
+	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
+	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
+	}
+      s++;
+    }}
+
+    for(int s=0;s<LLs;s++){ // recombine the two accumulated halves and store with an overall factor 0.5
+      int lex = s+LLs*site;
+      spRecon5p(SiteChi[s],SiteChiP[s]);
+      accumRecon5m(SiteChi[s],SiteChiM[s]);
+      chi[lex] = SiteChi[s]*0.5;
+    }
+  }
+}
+
+INSTANTIATE_DPERP(DomainWallVec5dImplD);
+INSTANTIATE_DPERP(DomainWallVec5dImplF);
+
+template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+
+}}
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
 #define  GRID_QCD_DOMAIN_WALL_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@ -75,7 +75,7 @@ namespace Grid {
    //
    //
    // template<class Impl>
-    // class MyOp : pubic<Impl> { 
+    // class MyOp : public<Impl> { 
    // public:
    //
    //    INHERIT_ALL_IMPL_TYPES(Impl);
@ -99,7 +99,7 @@ namespace Grid {
    typedef typename Impl::SiteSpinor               SiteSpinor;		\
    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
    typedef typename Impl::Compressor               Compressor;		\
-    typedef typename Impl::StencilImpl              StencilImpl;	\
+    typedef typename Impl::StencilImpl             StencilImpl;	  \
    typedef typename Impl::ImplParams ImplParams;

 #define INHERIT_IMPL_TYPES(Base) \
@ -110,9 +110,11 @@ namespace Grid {
    // Single flavour four spinors with colour index
    ///////
    template<class S,int Nrepresentation=Nc>
-    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S, Nrepresentation> > { 
    public:

+      const bool LsVectorised=false;
+
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@ -191,8 +193,10 @@ PARALLEL_FOR_LOOP
    // Single flavour four spinors with colour index, 5d redblack
    ///////
    template<class S,int Nrepresentation=Nc>
-    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
+    
+      const bool LsVectorised=true;

      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

@ -221,7 +225,7 @@ PARALLEL_FOR_LOOP

      ImplParams Params;

-      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+      DomainWallVec5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 

      bool overlapCommsCompute(void) { return false; };
    
@ -287,6 +291,8 @@ PARALLEL_FOR_LOOP
    class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
    public:

+      const bool LsVectorised=false;
+
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@ -446,10 +452,10 @@ PARALLEL_FOR_LOOP
 	// DhopDir provides U or Uconj depending on coor/flavour.
 	GaugeLinkField link(mat._grid);
 	// use lorentz for flavour as hack.
-	auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde,A));  
 PARALLEL_FOR_LOOP
-        for(auto ss=tmp.begin();ss<tmp.end();ss++){
-	  link[ss]() = tmp[ss](0,0) - conjugate(tmp[ss](1,1)) ;
+        for(auto ss=link.begin();ss<link.end();ss++){
+	  auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[ss],A[ss]));  
+	  link[ss]() = ttmp(0,0) + conjugate(ttmp(1,1)) ; 
 	}
 	PokeIndex<LorentzIndex>(mat,link,mu);
 	return;
@ -477,9 +483,9 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

-    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
-    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
-    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
+    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
+    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double

    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
--- a/lib/qcd/action/fermion/MobiusFermion.h
+++ b/lib/qcd/action/fermion/MobiusFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_FERMION_H
 #define  GRID_QCD_MOBIUS_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/lib/qcd/action/fermion/MobiusZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
 #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
 #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SCALED_SHAMIR_FERMION_H
 #define  GRID_QCD_SCALED_SHAMIR_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/lib/qcd/action/fermion/ShamirZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@ -48,9 +48,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 				       GridRedBlackCartesian &FourDimRedBlackGrid,
 				       RealD _M5,const ImplParams &p) :
  Kernels(p),
-  _FiveDimGrid(&FiveDimGrid),
+  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid(&FourDimGrid),
+  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
@ -62,60 +62,83 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid)
 {
-  // some assertions
-  assert(FiveDimGrid._ndimension==5);
-  assert(FourDimGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._ndimension==5);
-  assert(FourDimRedBlackGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._checker_dim==1);
+  if (Impl::LsVectorised) { 

-  // Dimension zero of the five-d is the Ls direction
-  Ls=FiveDimGrid._fdimensions[0];
-  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-  assert(FiveDimRedBlackGrid._processors[0] ==1);
-  assert(FiveDimRedBlackGrid._simd_layout[0]==1);
-  assert(FiveDimGrid._processors[0]         ==1);
-  assert(FiveDimGrid._simd_layout[0]        ==1);
+    int nsimd = Simd::Nsimd();
+    
+    // some assertions
+    assert(FiveDimGrid._ndimension==5);
+    assert(FiveDimRedBlackGrid._ndimension==5);
+    assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+    assert(FourDimGrid._ndimension==4);

-  // Other dimensions must match the decomposition of the four-D fields 
-  for(int d=0;d<4;d++){
-    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    // Dimension zero of the five-d is the Ls direction
+    Ls=FiveDimGrid._fdimensions[0];
+    assert(FiveDimGrid._processors[0]         ==1);
+    assert(FiveDimGrid._simd_layout[0]        ==nsimd);

-    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
-    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+    assert(FiveDimRedBlackGrid._processors[0] ==1);
+    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

-    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
-    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+    // Other dimensions must match the decomposition of the four-D fields 
+    for(int d=0;d<4;d++){ // four-D direction d corresponds to five-D index d+1
+      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+      
+      assert(FourDimGrid._simd_layout[d]==1); // compare, do not assign: '=' always passed and overwrote the grid's layout (and did nothing under NDEBUG)
+      assert(FourDimRedBlackGrid._simd_layout[d]==1); // same fix for the red-black four-D grid
+      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);

-    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
-    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    }
+
+  } else {
+
+    // some assertions
+    assert(FiveDimGrid._ndimension==5);
+    assert(FourDimGrid._ndimension==4);
+    assert(FiveDimRedBlackGrid._ndimension==5);
+    assert(FourDimRedBlackGrid._ndimension==4);
+    assert(FiveDimRedBlackGrid._checker_dim==1);
+    
+    // Dimension zero of the five-d is the Ls direction
+    Ls=FiveDimGrid._fdimensions[0];
+    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+    assert(FiveDimRedBlackGrid._processors[0] ==1);
+    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
+    assert(FiveDimGrid._processors[0]         ==1);
+    assert(FiveDimGrid._simd_layout[0]        ==1);
+    
+    // Other dimensions must match the decomposition of the four-D fields 
+    for(int d=0;d<4;d++){
+      assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+      
+      assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+      
+      assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
+      assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+      
+      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    }
  }
-
+    
  // Allocate the required comms buffer
  ImportGauge(_Umu);
-}  
-
+}
+  /*
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
 				       GridCartesian         &FiveDimGrid,
 				       GridRedBlackCartesian &FiveDimRedBlackGrid,
 				       GridCartesian         &FourDimGrid,
 				       RealD _M5,const ImplParams &p) :
-  Kernels(p),
-  _FiveDimGrid        (&FiveDimGrid),
-  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid        (&FourDimGrid),
-  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
-  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
-  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
-  M5(_M5),
-  Umu(_FourDimGrid),
-  UmuEven(_FourDimGrid),
-  UmuOdd (_FourDimGrid),
-  Lebesgue(_FourDimGrid),
-  LebesgueEvenOdd(_FourDimGrid)
 {
  int nsimd = Simd::Nsimd();

@ -148,15 +171,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
  }

  {
-    GaugeField HUmu(_Umu._grid);
-    HUmu = _Umu*(-0.5);
-    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
-    UmuEven=Umu;// Really want a reference.
-    UmuOdd =Umu;
  }
 }  
-
-
+  */
+     
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@ -376,8 +394,6 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag

 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
-template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
-template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
  
 }}

--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@ -125,12 +125,14 @@ namespace Grid {
 		      double _M5,const ImplParams &p= ImplParams());

      // Constructors
+      /*
      WilsonFermion5D(int simd, 
 		      GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      double _M5,const ImplParams &p= ImplParams());
+      */

      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@ -572,7 +572,4 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,

  FermOpTemplateInstantiate(WilsonKernels);

-template class WilsonKernels<DomainWallRedBlack5dImplF>;		
-template class WilsonKernels<DomainWallRedBlack5dImplD>;
-
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@ -90,7 +90,7 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 template<>
-void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -110,10 +110,10 @@ template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl
 template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@ -867,16 +867,16 @@ template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(Stencil
 									 int ss,int sU,const FermionField &in, FermionField &out);


-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);

--- a/lib/qcd/action/fermion/WilsonTMFermion.h
+++ b/lib/qcd/action/fermion/WilsonTMFermion.h
@ -28,7 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_WILSON_TM_FERMION_H
 #define  GRID_QCD_WILSON_TM_FERMION_H

-#include <Grid.h>
+#include <Grid/Grid.h>

 namespace Grid {

--- a/lib/qcd/action/gauge/GaugeImpl.h
+++ b/lib/qcd/action/gauge/GaugeImpl.h
@ -1,181 +1,188 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/qcd/action/gauge/GaugeImpl.h
+Source file: ./lib/qcd/action/gauge/GaugeImpl.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef  GRID_QCD_GAUGE_IMPL_H
-#define  GRID_QCD_GAUGE_IMPL_H
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_QCD_GAUGE_IMPL_H
+#define GRID_QCD_GAUGE_IMPL_H

 namespace Grid {

-  namespace QCD {
+namespace QCD {

-    
-    ////////////////////////////////////////////////////////////////////////
-    // Implementation dependent gauge types
-    ////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Implementation dependent gauge types
+////////////////////////////////////////////////////////////////////////

-template<class Gimpl> class WilsonLoops;
+template <class Gimpl> class WilsonLoops;

-#define INHERIT_GIMPL_TYPES(GImpl) \
-    typedef typename GImpl::Simd                           Simd;\
-    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
-    typedef typename GImpl::GaugeField               GaugeField;\
-    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
-    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;
+#define INHERIT_GIMPL_TYPES(GImpl) /* re-export GImpl's gauge types */         \
+  using Simd = typename GImpl::Simd;                                           \
+  using GaugeLinkField = typename GImpl::GaugeLinkField;                       \
+  using GaugeField = typename GImpl::GaugeField;                               \
+  using SiteGaugeField = typename GImpl::SiteGaugeField;                       \
+  using SiteGaugeLink = typename GImpl::SiteGaugeLink;

+//
+template <class S, int Nrepresentation = Nc> class GaugeImplTypes {
+public:
+  typedef S Simd;

-    // 
-    template<class S,int Nrepresentation=Nc>
-    class GaugeImplTypes { 
-    public:
-    
-      typedef S Simd;
-    
-      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
-      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd  >;
-    
-      typedef iImplGaugeLink    <Simd>           SiteGaugeLink;
-      typedef iImplGaugeField   <Simd>           SiteGaugeField;
-    
-      typedef Lattice<SiteGaugeLink>                GaugeLinkField; // bit ugly naming; polarised gauge field, lorentz... all ugly
-      typedef Lattice<SiteGaugeField>                   GaugeField;
+  template <typename vtype>
+  using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation>>>;
+  template <typename vtype>
+  using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation>>, Nd>;

-    };
+  typedef iImplGaugeLink<Simd> SiteGaugeLink;
+  typedef iImplGaugeField<Simd> SiteGaugeField;

-    // Composition with smeared link, bc's etc.. probably need multiple inheritance
-    // Variable precision "S" and variable Nc
-    template<class GimplTypes>
-    class PeriodicGaugeImpl : public GimplTypes  { 
-    public:
+  typedef Lattice<SiteGaugeLink> GaugeLinkField; // bit ugly naming; polarised
+                                                 // gauge field, lorentz... all
+                                                 // ugly
+  typedef Lattice<SiteGaugeField> GaugeField;

-    INHERIT_GIMPL_TYPES(GimplTypes);
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Support needed for the assembly of loops including all boundary condition effects such as conjugate bcs
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    
-      template<class covariant>  static inline
-      Lattice<covariant> CovShiftForward (const GaugeLinkField &Link, int mu, const Lattice<covariant> &field) {
-	return PeriodicBC::CovShiftForward(Link,mu,field);
-      }
-
-      template<class covariant> static inline
-      Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,const Lattice<covariant> &field) {
-	return PeriodicBC::CovShiftBackward(Link,mu,field);
-      }
-      static inline
-      GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-	return Cshift(adj(Link),mu,-1);
-      }
-      static inline
-      GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-	return Link;
-      }
-      static inline
-      GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-	return Cshift(Link,mu,1);
-      }
-
-      static inline bool isPeriodicGaugeField(void) {
-	return true;
-      }
-
-    };
-
-    
-    // Composition with smeared link, bc's etc.. probably need multiple inheritance
-    // Variable precision "S" and variable Nc
-    template<class GimplTypes>
-    class ConjugateGaugeImpl : public GimplTypes { 
-    public:
-
-      INHERIT_GIMPL_TYPES(GimplTypes);
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Support needed for the assembly of loops including all boundary condition effects such as Gparity.
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class covariant>  static
-    Lattice<covariant> CovShiftForward (const GaugeLinkField &Link, int mu, const Lattice<covariant> &field) {
-      return ConjugateBC::CovShiftForward(Link,mu,field);
+  // Move this elsewhere?
+  static inline void AddGaugeLink(GaugeField &U, GaugeLinkField &W,
+                                  int mu) { // U[mu] += W
+    PARALLEL_FOR_LOOP
+    for (auto ss = 0; ss < U._grid->oSites(); ss++) {
+      U._odata[ss]._internal[mu] =
+          U._odata[ss]._internal[mu] + W._odata[ss]._internal;
    }
-
-    template<class covariant> static
-    Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,const Lattice<covariant> &field) {
-      return ConjugateBC::CovShiftBackward(Link,mu,field);
-    }
-
-    static inline
-    GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
-      GridBase *grid = Link._grid;
-      int Lmu = grid->GlobalDimensions()[mu]-1;
-      
-      Lattice<iScalar<vInteger> > coor(grid);    LatticeCoordinate(coor,mu);
-
-      GaugeLinkField tmp (grid);
-      tmp=adj(Link);
-      tmp = where(coor==Lmu,conjugate(tmp),tmp);
-      return Cshift(tmp,mu,-1);// moves towards positive mu
-    }
-    static inline
-    GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
-      return Link;
-    }
-
-    static inline
-    GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
-      GridBase *grid = Link._grid;
-      int Lmu = grid->GlobalDimensions()[mu]-1;
-      
-      Lattice<iScalar<vInteger> > coor(grid);    LatticeCoordinate(coor,mu);
-
-      GaugeLinkField tmp (grid);
-      tmp=Cshift(Link,mu,1);
-      tmp=where(coor==Lmu,conjugate(tmp),tmp);
-      return tmp;
-    }
-
-    static inline bool isPeriodicGaugeField(void) {
-      return false;
-    }
-    
-    };
-
-    typedef GaugeImplTypes<vComplex,Nc>     GimplTypesR;
-    typedef GaugeImplTypes<vComplexF,Nc>    GimplTypesF;
-    typedef GaugeImplTypes<vComplexD,Nc>    GimplTypesD;
-
-    typedef PeriodicGaugeImpl<GimplTypesR> PeriodicGimplR; // Real.. whichever prec
-    typedef PeriodicGaugeImpl<GimplTypesF> PeriodicGimplF; // Float
-    typedef PeriodicGaugeImpl<GimplTypesD> PeriodicGimplD; // Double
-
-    typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever prec
-    typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
-    typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
-
  }
+};
+
+// Composition with smeared link, bc's etc.. probably need multiple inheritance
+// Variable precision "S" and variable Nc
+template <class GimplTypes>
+class PeriodicGaugeImpl : public GimplTypes {
+public:
+  INHERIT_GIMPL_TYPES(GimplTypes);
+
+  // Shift helpers used when assembling loops.  The two field shifts are
+  // forwarded to PeriodicBC; the remaining helpers act on the link itself
+  // and apply no boundary modification.
+
+  // Forwards to PeriodicBC::CovShiftForward.
+  template <class covariant>
+  static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
+                                            const Lattice<covariant> &field) {
+    return PeriodicBC::CovShiftForward(Link, mu, field);
+  }
+
+  // Forwards to PeriodicBC::CovShiftBackward.
+  template <class covariant>
+  static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
+                                             const Lattice<covariant> &field) {
+    return PeriodicBC::CovShiftBackward(Link, mu, field);
+  }
+
+  // Returns adj(Link) after a Cshift by -1 in direction mu.
+  static GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link,
+                                                 int mu) {
+    return Cshift(adj(Link), mu, -1);
+  }
+
+  // Returns Link unchanged; mu is not used.
+  static GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link,
+                                                int mu) {
+    return Link;
+  }
+
+  // Returns Link after a Cshift by +1 in direction mu.
+  static GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
+    return Cshift(Link, mu, 1);
+  }
+
+  // This implementation always reports a periodic gauge field.
+  static bool isPeriodicGaugeField() { return true; }
+};
+
+// Composition with smeared link, bc's etc.. probably need multiple inheritance
+// Variable precision "S" and variable Nc
+template <class GimplTypes>
+class ConjugateGaugeImpl : public GimplTypes {
+public:
+  INHERIT_GIMPL_TYPES(GimplTypes);
+
+  // Shift helpers used when assembling loops with boundary effects such as
+  // G-parity.  The two field shifts are forwarded to ConjugateBC; the link
+  // helpers complex-conjugate the sites whose mu coordinate equals
+  // GlobalDimensions()[mu] - 1.
+
+  // Forwards to ConjugateBC::CovShiftForward.
+  template <class covariant>
+  static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
+                                            const Lattice<covariant> &field) {
+    return ConjugateBC::CovShiftForward(Link, mu, field);
+  }
+
+  // Forwards to ConjugateBC::CovShiftBackward.
+  template <class covariant>
+  static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
+                                             const Lattice<covariant> &field) {
+    return ConjugateBC::CovShiftBackward(Link, mu, field);
+  }
+
+  // Takes adj(Link), conjugates it on the last mu coordinate, then applies a
+  // Cshift by -1 in direction mu.
+  static GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link,
+                                                 int mu) {
+    GridBase *base = Link._grid;
+    int lastCoor = base->GlobalDimensions()[mu] - 1;
+
+    Lattice<iScalar<vInteger>> muCoor(base);
+    LatticeCoordinate(muCoor, mu);
+
+    GaugeLinkField links(base);
+    links = adj(Link);
+    links = where(muCoor == lastCoor, conjugate(links), links);
+    return Cshift(links, mu, -1); // moves towards positive mu
+  }
+
+  // Returns Link unchanged; mu is not used.
+  static GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link,
+                                                int mu) {
+    return Link;
+  }
+
+  // Applies a Cshift by +1 in direction mu, then conjugates the result on
+  // the last mu coordinate.
+  static GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
+    GridBase *base = Link._grid;
+    int lastCoor = base->GlobalDimensions()[mu] - 1;
+
+    Lattice<iScalar<vInteger>> muCoor(base);
+    LatticeCoordinate(muCoor, mu);
+
+    GaugeLinkField links(base);
+    links = Cshift(Link, mu, 1);
+    links = where(muCoor == lastCoor, conjugate(links), links);
+    return links;
+  }
+
+  // This implementation never reports a periodic gauge field.
+  static bool isPeriodicGaugeField() { return false; }
+};
+
+// Gauge type bundles, one per SIMD complex type, all with Nc colours.
+using GimplTypesR = GaugeImplTypes<vComplex, Nc>;
+using GimplTypesF = GaugeImplTypes<vComplexF, Nc>;
+using GimplTypesD = GaugeImplTypes<vComplexD, Nc>;
+
+// Periodic implementations built on the bundles above.
+using PeriodicGimplR = PeriodicGaugeImpl<GimplTypesR>; // Real.. whichever prec
+using PeriodicGimplF = PeriodicGaugeImpl<GimplTypesF>; // Float
+using PeriodicGimplD = PeriodicGaugeImpl<GimplTypesD>; // Double
+
+// Conjugate implementations built on the bundles above.
+using ConjugateGimplR = ConjugateGaugeImpl<GimplTypesR>; // Real.. whichever prec
+using ConjugateGimplF = ConjugateGaugeImpl<GimplTypesF>; // Float
+using ConjugateGimplD = ConjugateGaugeImpl<GimplTypesD>; // Double
+}
 }

 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@ -1,212 +1,214 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
 #define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H

-namespace Grid{
-  namespace QCD{
+namespace Grid {
+namespace QCD {

-    ///////////////////////////////////////
-    // One flavour rational
-    ///////////////////////////////////////
+///////////////////////////////////////
+// One flavour rational
+///////////////////////////////////////

-    // S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
+// S_f = chi^dag *  N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
+//
+// Here, M is some operator
+// N and D make up the rational polynomial
+//
+
+template <class Impl>
+class OneFlavourEvenOddRationalPseudoFermionAction
+    : public Action<typename Impl::GaugeField> {
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+
+  typedef OneFlavourRationalParams Params;
+  Params param;
+
+  MultiShiftFunction PowerHalf;
+  MultiShiftFunction PowerNegHalf;
+  MultiShiftFunction PowerQuarter;
+  MultiShiftFunction PowerNegQuarter;
+
+ private:
+  FermionOperator<Impl> &FermOp;  // the basic operator
+
+  // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us
+  // historically
+  // and hasenbusch works better
+
+  FermionField PhiEven;  // the pseudo fermion field for this trajectory
+  FermionField PhiOdd;   // the pseudo fermion field for this trajectory
+
+ public:
+  // Binds the action to the fermion operator Op (held by reference), copies
+  // the parameter set p into param, and constructs both pseudofermion fields
+  // on Op's red-black grid.  A single AlgRemez object, built from param.lo,
+  // param.hi and param.precision, then generates the degree-param.degree
+  // approximations for x^(1/2) and x^(1/4).  Each approximation initialises
+  // a pair of MultiShiftFunctions: the last Init argument is false for the
+  // Power* member and true for its PowerNeg* counterpart.
+  OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl> &Op,
+                                               Params &p)
+      // Listed in declaration order (param is declared before FermOp,
+      // PhiEven and PhiOdd): that is the order in which the members are
+      // actually constructed, whatever order is written here.
+      : param(p),
+        FermOp(Op),
+        PhiEven(Op.FermionRedBlackGrid()),
+        PhiOdd(Op.FermionRedBlackGrid()) {
+    AlgRemez remez(param.lo, param.hi, param.precision);
+
+    // MdagM^(+- 1/2)
+    std::cout << GridLogMessage << "Generating degree " << param.degree
+              << " for x^(1/2)" << std::endl;
+    remez.generateApprox(param.degree, 1, 2);
+    PowerHalf.Init(remez, param.tolerance, false);
+    PowerNegHalf.Init(remez, param.tolerance, true);
+
+    // MdagM^(+- 1/4)
+    std::cout << GridLogMessage << "Generating degree " << param.degree
+              << " for x^(1/4)" << std::endl;
+    remez.generateApprox(param.degree, 1, 4);
+    PowerQuarter.Init(remez, param.tolerance, false);
+    PowerNegQuarter.Init(remez, param.tolerance, true);
+  }
+
+  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
+    //        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
+    // Phi = MpcdagMpc^{1/4} eta
    //
-    // Here, M is some operator 
-    // N and D makeup the rat. poly 
+    // P(eta) = e^{- eta^dag eta}
    //
-  
-    template<class Impl>
-    class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
-    public:
-      INHERIT_IMPL_TYPES(Impl);
+    // e^{x^2/2 sig^2} => sig^2 = 0.5.
+    //
+    // So eta should be of width sig = 1/sqrt(2).

-      typedef OneFlavourRationalParams Params;
-      Params param;
+    RealD scale = std::sqrt(0.5);

-      MultiShiftFunction PowerHalf   ;
-      MultiShiftFunction PowerNegHalf;
-      MultiShiftFunction PowerQuarter;
-      MultiShiftFunction PowerNegQuarter;
+    FermionField eta(FermOp.FermionGrid());
+    FermionField etaOdd(FermOp.FermionRedBlackGrid());
+    FermionField etaEven(FermOp.FermionRedBlackGrid());

-    private:
-     
-      FermionOperator<Impl> & FermOp;// the basic operator
+    gaussian(pRNG, eta);
+    eta = eta * scale;

-      // NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us historically
-      // and hasenbusch works better
+    pickCheckerboard(Even, etaEven, eta);
+    pickCheckerboard(Odd, etaOdd, eta);

-      FermionField PhiEven; // the pseudo fermion field for this trajectory
-      FermionField PhiOdd; // the pseudo fermion field for this trajectory
-                        
+    FermOp.ImportGauge(U);

-    public:
+    // multishift CG
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerQuarter);
+    msCG(Mpc, etaOdd, PhiOdd);

-      OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl>  &Op, 
-						   Params & p ) : FermOp(Op), 
-	PhiEven(Op.FermionRedBlackGrid()), 
-	PhiOdd (Op.FermionRedBlackGrid()), 
-	param(p) 
-      {
-	AlgRemez remez(param.lo,param.hi,param.precision);
+    //////////////////////////////////////////////////////
+    // FIXME : Clover term not yet..
+    //////////////////////////////////////////////////////

-	// MdagM^(+- 1/2)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-	remez.generateApprox(param.degree,1,2);
-	PowerHalf.Init(remez,param.tolerance,false);
-	PowerNegHalf.Init(remez,param.tolerance,true);
+    assert(FermOp.ConstEE() == 1);
+    PhiEven = zero;
+  };

-	// MdagM^(+- 1/4)
-	std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
-	remez.generateApprox(param.degree,1,4);
-   	PowerQuarter.Init(remez,param.tolerance,false);
-	PowerNegQuarter.Init(remez,param.tolerance,true);
-      };
-      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+  //////////////////////////////////////////////////////
+  // S = phi^dag (Mdag M)^-1/2 phi
+  //////////////////////////////////////////////////////
+  virtual RealD S(const GaugeField &U) {
+    FermOp.ImportGauge(U);

-	// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
-	//        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
-	// Phi = MpcdagMpc^{1/4} eta 
-	//
-	// P(eta) = e^{- eta^dag eta}
-	//
-	// e^{x^2/2 sig^2} => sig^2 = 0.5.
-	// 
-	// So eta should be of width sig = 1/sqrt(2).
+    FermionField Y(FermOp.FermionRedBlackGrid());

-	RealD scale = std::sqrt(0.5);
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);

-	FermionField eta    (FermOp.FermionGrid());
-	FermionField etaOdd (FermOp.FermionRedBlackGrid());
-	FermionField etaEven(FermOp.FermionRedBlackGrid());
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,
+                                                   PowerNegQuarter);

-	gaussian(pRNG,eta);	eta=eta*scale;
+    msCG(Mpc, PhiOdd, Y);

-	pickCheckerboard(Even,etaEven,eta);
-	pickCheckerboard(Odd,etaOdd,eta);
+    RealD action = norm2(Y);
+    std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 "
+                                   "solve or -1/2 solve faster??? "
+              << action << std::endl;

-	FermOp.ImportGauge(U);
+    return action;
+  };

-	// mutishift CG
-	SchurDifferentiableOperator<Impl> Mpc(FermOp);
-	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerQuarter);
-	msCG(Mpc,etaOdd,PhiOdd);
+  //////////////////////////////////////////////////////
+  // Need
+  // dS_f/dU = chi^dag   d[N/D]  chi
+  //
+  // N/D is expressed as partial fraction expansion:
+  //
+  //           a0 + \sum_k ak/(M^dagM + bk)
+  //
+  // d[N/D] is then
+  //
+  //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M +
+  //          bk]^{-1}
+  //
+  // Need
+  //       Mf Phi_k = [MdagM+bk]^{-1} Phi
+  //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
+  //
+  // With these building blocks
+  //
+  //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf
+  //       Phi_k
+  //        S    = innerprodReal(Phi,Mf Phi);
+  //////////////////////////////////////////////////////
+  virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
+    const int Npole = PowerNegHalf.poles.size();

-	//////////////////////////////////////////////////////
-	// FIXME : Clover term not yet..
-	//////////////////////////////////////////////////////
+    std::vector<FermionField> MPhi_k(Npole, FermOp.FermionRedBlackGrid());

-	assert(FermOp.ConstEE() == 1);
-	PhiEven = zero;
-	
-      };
+    FermionField X(FermOp.FermionRedBlackGrid());
+    FermionField Y(FermOp.FermionRedBlackGrid());

-      //////////////////////////////////////////////////////
-      // S = phi^dag (Mdag M)^-1/2 phi
-      //////////////////////////////////////////////////////
-      virtual RealD S(const GaugeField &U) {
+    GaugeField tmp(FermOp.GaugeGrid());

-	FermOp.ImportGauge(U);
+    FermOp.ImportGauge(U);

-	FermionField Y(FermOp.FermionRedBlackGrid());
-	
-	SchurDifferentiableOperator<Impl> Mpc(FermOp);
+    SchurDifferentiableOperator<Impl> Mpc(FermOp);

-	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegQuarter);
+    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerNegHalf);

-	msCG(Mpc,PhiOdd,Y);
+    msCG(Mpc, PhiOdd, MPhi_k);

-	RealD action = norm2(Y);
-	std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
+    dSdU = zero;
+    for (int k = 0; k < Npole; k++) {
+      RealD ak = PowerNegHalf.residues[k];

-	return action;
-      };
+      X = MPhi_k[k];

-      //////////////////////////////////////////////////////
-      // Need
-      // dS_f/dU = chi^dag   d[N/D]  chi
-      //
-      // N/D is expressed as partial fraction expansion:
-      //
-      //           a0 + \sum_k ak/(M^dagM + bk)
-      //
-      // d[N/D] is then
-      //
-      //          \sum_k -ak [M^dagM+bk]^{-1}  [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
-      //
-      // Need
-      //       Mf Phi_k = [MdagM+bk]^{-1} Phi
-      //       Mf Phi   = \sum_k ak [MdagM+bk]^{-1} Phi
-      //
-      // With these building blocks
-      //
-      //       dS/dU =  \sum_k -ak Mf Phi_k^dag      [ dM^dag M + M^dag dM ] Mf Phi_k
-      //        S    = innerprodReal(Phi,Mf Phi);
-      //////////////////////////////////////////////////////
-      virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+      Mpc.Mpc(X, Y);
+      Mpc.MpcDeriv(tmp, Y, X);
+      dSdU = dSdU + ak * tmp;
+      Mpc.MpcDagDeriv(tmp, X, Y);
+      dSdU = dSdU + ak * tmp;
+    }

-	const int Npole = PowerNegHalf.poles.size();
-
-	std::vector<FermionField> MPhi_k (Npole,FermOp.FermionRedBlackGrid());
-
-	FermionField X(FermOp.FermionRedBlackGrid());
-	FermionField Y(FermOp.FermionRedBlackGrid());
-
-	GaugeField   tmp(FermOp.GaugeGrid());
-
-	FermOp.ImportGauge(U);
-
-	SchurDifferentiableOperator<Impl> Mpc(FermOp);
-
-	ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegHalf);
-
-	msCG(Mpc,PhiOdd,MPhi_k);
-
-	dSdU = zero;
-	for(int k=0;k<Npole;k++){
-
-	  RealD ak = PowerNegHalf.residues[k];
-
-	  X  = MPhi_k[k];
-
-	  Mpc.Mpc(X,Y);
-	  Mpc.MpcDeriv   (tmp , Y, X );  dSdU=dSdU+ak*tmp;
-	  Mpc.MpcDagDeriv(tmp , X, Y );  dSdU=dSdU+ak*tmp;
-
-	}
-
-	dSdU = Ta(dSdU);
-
-      };
-    };
-  }
+    // dSdU = Ta(dSdU);
+  };
+};
+}
 }

-
 #endif
--- a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@ -256,7 +256,7 @@ namespace Grid{

 	}

-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };
    };
--- a/lib/qcd/action/pseudofermion/OneFlavourRational.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRational.h
@ -186,7 +186,7 @@ namespace Grid{

 	}

-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };
    };
--- a/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@ -242,7 +242,7 @@ namespace Grid{

 	}

-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };
    };
--- a/lib/qcd/action/pseudofermion/TwoFlavour.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavour.h
@ -137,7 +137,7 @@ namespace Grid{
 	FermOp.MDeriv(tmp , Y, X,DaggerNo );  dSdU=tmp;
 	FermOp.MDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
 	
-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };

--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
@ -100,7 +100,7 @@ namespace Grid{

 	PhiOdd =PhiOdd*scale;
 	PhiEven=PhiEven*scale;
-	
+
      };

      //////////////////////////////////////////////////////
@ -173,7 +173,7 @@ namespace Grid{
 	FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
 	*/
 	
-	dSdU = Ta(dSdU);
+	//dSdU = Ta(dSdU);

      };

--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@ -188,8 +188,9 @@ namespace Grid{
 	assert(NumOp.ConstEE() == 1);
 	assert(DenOp.ConstEE() == 1);

-	dSdU = -Ta(dSdU);
-
+	//dSdU = -Ta(dSdU);
+	dSdU = -dSdU;
+	
      };
    };
  }
--- a/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourRatio.h
@ -155,7 +155,8 @@ namespace Grid{
 	DenOp.MDeriv(force,Y,X,DaggerNo);   dSdU=dSdU-force;
 	DenOp.MDeriv(force,X,Y,DaggerYes);  dSdU=dSdU-force;

-	dSdU = - Ta(dSdU);
+	dSdU *= -1.0;
+	//dSdU = - Ta(dSdU);

      };
    };
--- a/lib/qcd/hmc/HMC.h
+++ b/lib/qcd/hmc/HMC.h
@ -1,33 +1,34 @@
-    /*************************************************************************************
+/*************************************************************************************

-    Grid physics library, www.github.com/paboyle/Grid 
+Grid physics library, www.github.com/paboyle/Grid

-    Source file: ./lib/qcd/hmc/HMC.h
+Source file: ./lib/qcd/hmc/HMC.h

-    Copyright (C) 2015
+Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>

-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.

-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.

-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
 //--------------------------------------------------------------------
 /*! @file HMC.h
 * @brief Classes for Hybrid Monte Carlo update
@ -41,172 +42,195 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #include <string>

+namespace Grid {
+namespace QCD {

-namespace Grid{
-  namespace QCD{
-    
+/*! @brief Run-control parameters for the Hybrid Monte Carlo driver. */
+struct HMCparameters {
+  Integer StartTrajectory;    /*!< index of the first trajectory of this run */
+  Integer Trajectories;       /*!< number of sweeps in this run */
+  bool MetropolisTest;        /*!< Metropolis on/off flag, as reported by print() */
+  Integer NoMetropolisUntil;  /*!< reported by print() as thermalization trajs */

-    struct HMCparameters{
+  /*! Default values: Metropolis on, 10 thermalization trajectories,
+      200 trajectories starting from trajectory 0. */
+  HMCparameters()
+      : StartTrajectory(0),
+        Trajectories(200),
+        MetropolisTest(true),
+        NoMetropolisUntil(10) {}

-      Integer StartTrajectory;
-      Integer Trajectories; /* @brief Number of sweeps in this run */
-      bool    MetropolisTest;
-      Integer NoMetropolisUntil;
+  /*! Write one line per parameter to std::cout, each tagged with
+      GridLogMessage. */
+  void print() const {
+    std::cout << GridLogMessage
+              << "[HMC parameter] Trajectories            : "
+              << Trajectories << "\n";
+    std::cout << GridLogMessage
+              << "[HMC parameter] Start trajectory        : "
+              << StartTrajectory << "\n";
+    std::cout << GridLogMessage
+              << "[HMC parameter] Metropolis test (on/off): "
+              << MetropolisTest << "\n";
+    std::cout << GridLogMessage
+              << "[HMC parameter] Thermalization trajs    : "
+              << NoMetropolisUntil << "\n";
+  }
+};

-      HMCparameters(){
-	////////////////////////////// Default values
-	MetropolisTest      = true;
-	NoMetropolisUntil   = 10;
-	StartTrajectory     = 0;
-	Trajectories        = 200;
-	/////////////////////////////////
-      }
-    };
+/*!
+ * @brief Interface for measurements hooked into the HMC trajectory loop.
+ *
+ * HybridMonteCarlo stores registered observables as HmcObservable pointers
+ * and calls TrajectoryComplete on each of them once per trajectory.
+ */
+template <class GaugeField>
+class HmcObservable {
+ public:
+  // Observables are handled through base-class pointers, so the destructor
+  // is virtual to keep destruction through such a pointer well defined.
+  virtual ~HmcObservable() {}
+
+  /*!
+   * @param traj  trajectory index reported by the caller
+   * @param U     gauge field at the end of the trajectory
+   * @param sRNG  serial random number generator of the run
+   * @param pRNG  parallel random number generator of the run
+   */
+  virtual void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
+                                  GridParallelRNG &pRNG) = 0;
+};

-    template<class GaugeField> 
-    class HmcObservable {
-    public:
-      virtual void TrajectoryComplete (int traj, GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG & pRNG )=0;
-    };
+/*!
+ * @brief Observable that records plaquette and rectangle averages.
+ *
+ * For every completed trajectory it writes one line to the file
+ * "<Stem>.<traj>" and echoes a header plus the same values to the log.
+ * The averages are computed both with Gimpl and with PeriodicGimplR.
+ */
+template <class Gimpl>
+class PlaquetteLogger : public HmcObservable<typename Gimpl::GaugeField> {
+ private:
+  std::string Stem;  // prefix of the per-trajectory output file name

-    template<class Gimpl> 
-    class PlaquetteLogger : public HmcObservable<typename Gimpl::GaugeField> {
-    private:
-      std::string Stem;
-    public:
-      INHERIT_GIMPL_TYPES(Gimpl);
-      PlaquetteLogger(std::string cf) {
-        Stem  = cf;
-      };
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+  PlaquetteLogger(std::string cf) : Stem(cf) {}

-      void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG & pRNG )
-      {
-	  std::string file;   { std::ostringstream os; os << Stem     <<"."<< traj; file = os.str(); }
-	  std::ofstream of(file);
+  // sRNG and pRNG are part of the observable interface but are not used here.
+  void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
+                          GridParallelRNG &pRNG) {
+    // Output file name: "<Stem>.<traj>"
+    std::ostringstream name;
+    name << Stem << "." << traj;
+    std::ofstream of(name.str());

-	  RealD peri_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
-	  RealD peri_rect = WilsonLoops<PeriodicGimplR>::avgRectangle(U);
+    const RealD peri_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+    const RealD peri_rect = WilsonLoops<PeriodicGimplR>::avgRectangle(U);

-	  RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
-	  RealD impl_rect = WilsonLoops<Gimpl>::avgRectangle(U);
+    const RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
+    const RealD impl_rect = WilsonLoops<Gimpl>::avgRectangle(U);

-	  of << traj<<" "<< impl_plaq << " " << impl_rect << "  "<< peri_plaq<<" "<<peri_rect<<std::endl;
-	  std::cout<< GridLogMessage<< "traj"<<" "<< "plaq " << " " << " rect  " << "  "<< "peri_plaq" <<" "<<"peri_rect"<<std::endl;
-	  std::cout<< GridLogMessage<< traj<<" "<< impl_plaq << " " << impl_rect << "  "<< peri_plaq<<" "<<peri_rect<<std::endl;
-      }
-    };
+    // File record
+    of << traj << " " << impl_plaq << " " << impl_rect
+       << "  " << peri_plaq << " " << peri_rect << std::endl;
+
+    // Column header followed by the same record in the log
+    std::cout << GridLogMessage
+              << "traj" << " " << "plaq " << " " << " rect  " << "  "
+              << "peri_plaq" << " " << "peri_rect" << std::endl;
+    std::cout << GridLogMessage
+              << traj << " " << impl_plaq << " " << impl_rect
+              << "  " << peri_plaq << " " << peri_rect << std::endl;
+  }
+};

-    //    template <class GaugeField, class Integrator, class Smearer, class Boundary> 
-    template <class GaugeField, class IntegratorType>
-    class HybridMonteCarlo {
-    private:
+//    template <class GaugeField, class Integrator, class Smearer, class
+//    Boundary>
+template <class GaugeField, class IntegratorType>
+class HybridMonteCarlo {
+ private:
+  const HMCparameters Params;

-      const HMCparameters Params;
-      
-      GridSerialRNG   &sRNG; // Fixme: need a RNG management strategy.
-      GridParallelRNG &pRNG; // Fixme: need a RNG management strategy.
-      GaugeField      & Ucur;
+  GridSerialRNG &sRNG;    // Fixme: need a RNG management strategy.
+  GridParallelRNG &pRNG;  // Fixme: need a RNG management strategy.
+  GaugeField &Ucur;

-      IntegratorType &TheIntegrator;
-      std::vector<HmcObservable<GaugeField> *> Observables;
+  IntegratorType &TheIntegrator;
+  std::vector<HmcObservable<GaugeField> *> Observables;

-      /////////////////////////////////////////////////////////
-      // Metropolis step
-      /////////////////////////////////////////////////////////
-      bool metropolis_test(const RealD DeltaH){
+  /////////////////////////////////////////////////////////
+  // Metropolis step
+  /////////////////////////////////////////////////////////
+  /*!
+   * @brief Accept/reject decision for an energy change DeltaH.
+   *
+   * Draws one number from sRNG and accepts when exp(-DeltaH) exceeds 1 or the
+   * drawn number does not exceed exp(-DeltaH). The decision is logged.
+   * @return true when the trajectory is accepted.
+   */
+  bool metropolis_test(const RealD DeltaH) {
+    RealD rn_test;

-	RealD rn_test;
+    const RealD prob = std::exp(-DeltaH);

-	RealD prob = std::exp(-DeltaH);
+    random(sRNG, rn_test);

-	random(sRNG,rn_test);
-      
-	std::cout<<GridLogMessage<< "--------------------------------------------\n";
-	std::cout<<GridLogMessage<< "dH = "<<DeltaH << "  Random = "<< rn_test <<"\n";
-	std::cout<<GridLogMessage<< "Acc. Probability = " << ((prob<1.0)? prob: 1.0)<< "   ";
-      
-	if((prob >1.0) || (rn_test <= prob)){       // accepted
-	  std::cout<<GridLogMessage <<"-- ACCEPTED\n";
-	  return true;
-	} else {                               // rejected
-	  std::cout<<GridLogMessage <<"-- REJECTED\n";
-	  return false;
-	}
+    const char *const separator =
+        "--------------------------------------------------\n";
+
+    std::cout << GridLogMessage << separator;
+    std::cout << GridLogMessage << "exp(-dH) = " << prob
+              << "  Random = " << rn_test << "\n";
+    std::cout << GridLogMessage
+              << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";

+    const bool accepted = (prob > 1.0) || (rn_test <= prob);
+    if (accepted) {
+      std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
+    } else {
+      std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
+    }
+    std::cout << GridLogMessage << separator;
+    return accepted;
+  }
+
+  /////////////////////////////////////////////////////////
+  // Evolution
+  /////////////////////////////////////////////////////////
+  /*!
+   * @brief Run one trajectory on U and return the change in total H.
+   *
+   * Refreshes the integrator, evaluates H before and after integrating U and
+   * logs both values with stream precision 17; the previous std::cout
+   * precision is restored after each message.
+   * @return H after the trajectory minus H before it.
+   */
+  RealD evolve_step(GaugeField &U) {
+    TheIntegrator.refresh(U, pRNG);  // set U and initialize P and phi's
+    const RealD H0 = TheIntegrator.S(U);  // initial state action
+
+    const std::streamsize saved_precision = std::cout.precision();
+    std::cout.precision(17);
+    std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
+    std::cout.precision(saved_precision);
+
+    TheIntegrator.integrate(U);
+    const RealD H1 = TheIntegrator.S(U);  // updated state action
+    const RealD dH = H1 - H0;
+
+    std::cout.precision(17);
+    std::cout << GridLogMessage << "Total H after trajectory  = " << H1
+              << "  dH = " << dH << "\n";
+    std::cout.precision(saved_precision);
+
+    return dH;
+  }
+
+ public:
+  /////////////////////////////////////////
+  // Constructor
+  /////////////////////////////////////////
+  /*!
+   * @param Pams   run parameters (copied into Params)
+   * @param _Int   integrator; held by reference
+   * @param _sRNG  serial RNG; held by reference
+   * @param _pRNG  parallel RNG; held by reference
+   * @param _U     gauge field updated by evolve(); held by reference
+   *
+   * The initializers are listed in member declaration order, which is the
+   * order the members are actually initialized in.
+   */
+  HybridMonteCarlo(HMCparameters Pams, IntegratorType &_Int,
+                   GridSerialRNG &_sRNG, GridParallelRNG &_pRNG, GaugeField &_U)
+      : Params(Pams), sRNG(_sRNG), pRNG(_pRNG), Ucur(_U), TheIntegrator(_Int) {}
+  ~HybridMonteCarlo(){};
+
+  /*! Register an observable to be notified after every trajectory. The
+      pointer is stored as given; this class never deletes it. */
+  void AddObservable(HmcObservable<GaugeField> *obs) { Observables.push_back(obs); }
+
+  /*!
+   * @brief Run the trajectory loop configured in Params.
+   *
+   * Each trajectory is evolved on a scratch copy of Ucur; the copy is written
+   * back only when the trajectory is accepted. Every registered observable is
+   * then notified with the index traj + 1 and the current Ucur.
+   */
+  void evolve(void) {
+    GaugeField Ucopy(Ucur._grid);
+
+    Params.print();
+
+    // Actual updates (evolve a copy Ucopy then copy back eventually)
+    for (int traj = Params.StartTrajectory;
+         traj < Params.Trajectories + Params.StartTrajectory; ++traj) {
+      std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
+      Ucopy = Ucur;
+
+      // Keep dH in RealD: evolve_step returns RealD and metropolis_test takes
+      // RealD. Real may be a different type (the build can be configured for
+      // single precision), so a Real temporary could lose digits in between.
+      const RealD DeltaH = evolve_step(Ucopy);
+
+      // The trajectory is accepted unconditionally when the Metropolis test
+      // is switched off or while traj is below Params.NoMetropolisUntil.
+      bool accept = true;
+      if (Params.MetropolisTest && traj >= Params.NoMetropolisUntil) {
+        accept = metropolis_test(DeltaH);
      }

-      /////////////////////////////////////////////////////////
-      // Evolution
-      /////////////////////////////////////////////////////////
-      RealD evolve_step(GaugeField& U){
-
-	TheIntegrator.refresh(U,pRNG); // set U and initialize P and phi's 
-
-	RealD H0 = TheIntegrator.S(U); // initial state action  
-
-	std::cout<<GridLogMessage<<"Total H before = "<< H0 << "\n";
-
-	TheIntegrator.integrate(U);
-      
-	RealD H1 = TheIntegrator.S(U); // updated state action            
-
-	std::cout<<GridLogMessage<<"Total H after = "<< H1 << "\n";
-
-	return (H1-H0);
-      }
-      
-    public:
-
-      /////////////////////////////////////////
-      // Constructor
-      /////////////////////////////////////////
-      HybridMonteCarlo(HMCparameters Pms,  IntegratorType &_Int, GridSerialRNG &_sRNG, GridParallelRNG &_pRNG, GaugeField &_U ) :
-        Params(Pms), 
-	TheIntegrator(_Int), 
-	sRNG(_sRNG),
-	pRNG(_pRNG),
-	Ucur(_U)
-      {
-      }
-      ~HybridMonteCarlo(){};
-
-      void AddObservable(HmcObservable<GaugeField> *obs) {
-	Observables.push_back(obs);
+      if (accept) {
+        Ucur = Ucopy;
      }

-      void evolve(void){
-
-	Real DeltaH;
-
-	GaugeField Ucopy(Ucur._grid);
-	
-	// Actual updates (evolve a copy Ucopy then copy back eventually)
-	for(int traj=Params.StartTrajectory; traj < Params.Trajectories+Params.StartTrajectory; ++traj){
-
-	  std::cout<<GridLogMessage << "-- # Trajectory = "<< traj <<  "\n";
-	  Ucopy = Ucur;
-
-	  DeltaH = evolve_step(Ucopy);
-
-	  bool accept = true;
-	  if ( traj > Params.NoMetropolisUntil) { 
-	    accept = metropolis_test(DeltaH);
-	  }
-	  
-	  if ( accept ) {
-	    Ucur = Ucopy;
-	  }
-
-	  for(int obs = 0;obs<Observables.size();obs++){
-	    Observables[obs]->TrajectoryComplete (traj+1,Ucur,sRNG,pRNG);
-	  }
-
-	}
+      // Range-for avoids comparing a signed index with Observables.size().
+      for (HmcObservable<GaugeField> *observer : Observables) {
+        observer->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
      }
-    };
-    
-  }// QCD
-}// Grid
+    }
+  }
+};

+}  // QCD
+}  // Grid

-#endif 
+#endif
--- a/lib/qcd/hmc/HmcRunner.h
+++ b/lib/qcd/hmc/HmcRunner.h
@ -47,7 +47,7 @@ public:
  GridRedBlackCartesian * UrbGrid ;
  GridRedBlackCartesian * FrbGrid ;

-  virtual void BuildTheAction (int argc, char **argv) = 0;
+  virtual void BuildTheAction (int argc, char **argv) = 0; // necessary?

  
  void Run (int argc, char  **argv){
@ -81,55 +81,78 @@ public:
      NumTraj = ivec[0];
    }

-    // Create integrator
-    typedef MinimumNorm2<GaugeField>  IntegratorType;// change here to change the algorithm
-    IntegratorParameters MDpar(20);
-    IntegratorType MDynamics(UGrid,MDpar, TheAction);
+    int NumThermalizations = 10;
+    if( GridCmdOptionExists(argv,argv+argc,"--Thermalizations") ){
+      arg= GridCmdOptionPayload(argv,argv+argc,"--Thermalizations");
+      std::vector<int> ivec(0);
+      GridCmdOptionIntVector(arg,ivec);
+      NumThermalizations = ivec[0];
+    }

+
+    GridSerialRNG    sRNG;
+    GridParallelRNG  pRNG(UGrid);
+    LatticeGaugeField  U(UGrid); // change this to an extended field (smearing class)
+
+    std::vector<int> SerSeed({1,2,3,4,5});
+    std::vector<int> ParSeed({6,7,8,9,10});
+
+    
+    // Create integrator, including the smearing policy
+    // Smearing policy
+    std::cout << GridLogDebug << " Creating the Stout class\n";
+    double rho = 0.1; // smearing parameter, now hardcoded
+    int Nsmear = 1;   // number of smearing levels
+    Smear_Stout<Gimpl> Stout(rho);
+    std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
+    SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
+    std::cout << GridLogDebug << " done\n";
+    //////////////
+    typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> >  IntegratorType;// change here to change the algorithm
+    IntegratorParameters MDpar(20);
+    IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy);
+
+    
    // Checkpoint strategy
    NerscHmcCheckpointer<Gimpl> Checkpoint(std::string("ckpoint_lat"),std::string("ckpoint_rng"),1);
    PlaquetteLogger<Gimpl>      PlaqLog(std::string("plaq"));

    HMCparameters HMCpar;
-    HMCpar.StartTrajectory = StartTraj;
-    HMCpar.Trajectories    = NumTraj;
+    HMCpar.StartTrajectory   = StartTraj;
+    HMCpar.Trajectories      = NumTraj;
+    HMCpar.NoMetropolisUntil = NumThermalizations;
    
-    GridSerialRNG    sRNG;
-    GridParallelRNG  pRNG(UGrid);
-    LatticeGaugeField  U(UGrid);
-
-    std::vector<int> SerSeed({1,2,3,4,5});
-    std::vector<int> ParSeed({6,7,8,9,10});

    if ( StartType == HotStart ) {
      // Hot start
-      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::HotConfiguration(pRNG, U);
    } else if ( StartType == ColdStart ) { 
      // Cold start
-      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::ColdConfiguration(pRNG, U);
    } else if ( StartType == TepidStart ) {       
      // Tepid start
-      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      sRNG.SeedFixedIntegers(SerSeed);
      pRNG.SeedFixedIntegers(ParSeed);
      SU3::TepidConfiguration(pRNG, U);
    } else if ( StartType == CheckpointStart ) { 
-      HMCpar.NoMetropolisUntil =10;
      HMCpar.MetropolisTest = true;
      // CheckpointRestart
      Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
    }

-    HybridMonteCarlo<GaugeField,IntegratorType>  HMC(HMCpar, MDynamics,sRNG,pRNG,U);
+    // Attach the gauge field to the smearing policy and fill the smeared set.
+    // Note that the unit configuration is singular in this procedure.
+    std::cout << GridLogMessage << "Filling the smeared set\n"; 
+    SmearingPolicy.set_GaugeField(U);
+    
+    HybridMonteCarlo<GaugeField,IntegratorType>  HMC(HMCpar, MDynamics,sRNG,pRNG,U); 
    HMC.AddObservable(&Checkpoint);
    HMC.AddObservable(&PlaqLog);
    
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@ -44,40 +44,40 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #include <memory>

-namespace Grid{
-  namespace QCD{
+ namespace Grid{
+ 	namespace QCD{

-    struct IntegratorParameters{
+ 		struct IntegratorParameters{

-      int Nexp;
+ 			int Nexp;
      int MDsteps;  // number of outer steps
      RealD trajL;  // trajectory length 
      RealD stepsize;

      IntegratorParameters(int MDsteps_, 
-			   RealD trajL_=1.0,
-			   int Nexp_=12):
-        Nexp(Nexp_),
-	MDsteps(MDsteps_),
-	trajL(trajL_),
-	stepsize(trajL/MDsteps)
-        {
+      	RealD trajL_=1.0,
+      	int Nexp_=12):
+      Nexp(Nexp_),
+      MDsteps(MDsteps_),
+      trajL(trajL_),
+      stepsize(trajL/MDsteps)
+      {
 	  // empty body constructor
-	};
+      };

-    };
+  };

    /*! @brief Class for Molecular Dynamics management */   
-    template<class GaugeField>
-    class Integrator {
+    template<class GaugeField, class SmearingPolicy>
+  class Integrator {

-    protected:
+  protected:

-      typedef IntegratorParameters ParameterType;
+  	typedef IntegratorParameters ParameterType;

-      IntegratorParameters Params;
+  	IntegratorParameters Params;

-      const ActionSet<GaugeField> as;
+  	const ActionSet<GaugeField> as;

      int levels;              //
      double t_U;              // Track time passing on each level and for U and for P
@ -85,17 +85,19 @@ namespace Grid{

      GaugeField P;

+      SmearingPolicy &Smearer;
+      
      // Should match any legal (SU(n)) gauge field
      // Need to use this template to match Ncol to pass to SU<N> class
      template<int Ncol,class vec> void generate_momenta(Lattice< iVector< iScalar< iMatrix<vec,Ncol> >, Nd> > & P,GridParallelRNG& pRNG){
-	typedef Lattice< iScalar< iScalar< iMatrix<vec,Ncol> > > > GaugeLinkField;
-	GaugeLinkField Pmu(P._grid);
-	Pmu = zero;
-	for(int mu=0;mu<Nd;mu++){
-	  SU<Ncol>::GaussianLieAlgebraMatrix(pRNG, Pmu);
-	  PokeIndex<LorentzIndex>(P, Pmu, mu);
-	}
+      typedef Lattice< iScalar< iScalar< iMatrix<vec,Ncol> > > > GaugeLinkField;
+      GaugeLinkField Pmu(P._grid);
+      Pmu = zero;
+      for(int mu=0;mu<Nd;mu++){
+      	SU<Ncol>::GaussianLieAlgebraMatrix(pRNG, Pmu);
+      	PokeIndex<LorentzIndex>(P, Pmu, mu);
      }
+  }


      //ObserverList observers; // not yet
@ -103,110 +105,128 @@ namespace Grid{
      //      void register_observers();
      //      void notify_observers();

-      void update_P(GaugeField&U, int level,double ep){
-	t_P[level]+=ep;
-	update_P(P,U,level,ep);
+  void update_P(GaugeField&U, int level, double ep){
+  	t_P[level]+=ep;
+  	update_P(P,U,level,ep);

-	std::cout<<GridLogIntegrator<<"["<<level<<"] P " << " dt "<< ep <<" : t_P "<< t_P[level] <<std::endl;
-      }
+  	std::cout<<GridLogIntegrator<<"["<<level<<"] P " << " dt "<< ep <<" : t_P "<< t_P[level] <<std::endl;
+  }

-      void update_P(GaugeField &Mom,GaugeField&U, int level,double ep){
-	for(int a=0; a<as[level].actions.size(); ++a){
-	  GaugeField force(U._grid);
-	  as[level].actions.at(a)->deriv(U,force);
-	  Mom = Mom - force*ep;
+  void update_P(GaugeField &Mom,GaugeField&U, int level,double ep){
+  	// input U actually not used... 
+  	for(int a=0; a<as[level].actions.size(); ++a){
+  		GaugeField force(U._grid);
+  		GaugeField& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
+  		as[level].actions.at(a)->deriv(Us,force); // deriv should NOT include Ta
+
+	  	std::cout<< GridLogIntegrator << "Smearing (on/off): "<<as[level].actions.at(a)->is_smeared <<std::endl;
+	  	if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
+	  	force = Ta(force);
+	  	std::cout<< GridLogIntegrator << "Force average: "<< norm2(force)/(U._grid->gSites()) <<std::endl;
+	  	Mom -= force*ep;
+	  }
 	}
-      }

-      void update_U(GaugeField&U, double ep){
-	update_U(P,U,ep);
+	void update_U(GaugeField&U, double ep){
+		update_U(P,U,ep);

-	t_U+=ep;
-	int fl = levels-1;
-	std::cout<<GridLogIntegrator<<"   "<<"["<<fl<<"] U " << " dt "<< ep <<" : t_U "<< t_U <<std::endl;
+		t_U+=ep;
+		int fl = levels-1;
+		std::cout<< GridLogIntegrator <<"   "<<"["<<fl<<"] U " << " dt "<< ep <<" : t_U "<< t_U <<std::endl;

-      }
-      void update_U(GaugeField &Mom, GaugeField&U, double ep){
+	}
+	void update_U(GaugeField &Mom, GaugeField&U, double ep){
 	//rewrite exponential to deal automatically  with the lorentz index?
 	//	GaugeLinkField Umu(U._grid);
 	//	GaugeLinkField Pmu(U._grid);
-	for (int mu = 0; mu < Nd; mu++){
-	  auto Umu=PeekIndex<LorentzIndex>(U, mu);
-	  auto Pmu=PeekIndex<LorentzIndex>(Mom, mu);
-	  Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
-	  ProjectOnGroup(Umu);
-	  PokeIndex<LorentzIndex>(U, Umu, mu);
+		for (int mu = 0; mu < Nd; mu++){
+			auto Umu=PeekIndex<LorentzIndex>(U, mu);
+			auto Pmu=PeekIndex<LorentzIndex>(Mom, mu);
+			Umu = expMat(Pmu, ep, Params.Nexp)*Umu;
+			ProjectOnGroup(Umu);
+			PokeIndex<LorentzIndex>(U, Umu, mu);
+		}
+	// Update the smeared fields, can be implemented as observer
+		Smearer.set_GaugeField(U);
 	}
-      }
-      
-      virtual void step (GaugeField& U,int level, int first,int last)=0;

-    public:
+	virtual void step (GaugeField& U,int level, int first,int last)=0;

-      Integrator(GridBase* grid, 
-		 IntegratorParameters Par,
-		 ActionSet<GaugeField> & Aset):
-          Params(Par),
-    	  as(Aset),
-	  P(grid),
-	  levels(Aset.size())
-      {
-	t_P.resize(levels,0.0);
-	t_U=0.0;
-      };
-      
-      virtual ~Integrator(){}
+public:
+
+	Integrator(GridBase* grid, 
+		IntegratorParameters Par,
+		ActionSet<GaugeField> & Aset,
+		SmearingPolicy &Sm):
+	Params(Par),
+	as(Aset),
+	P(grid),
+	levels(Aset.size()),
+	Smearer(Sm)
+	{
+		t_P.resize(levels,0.0);
+		t_U=0.0;
+	// initialization of smearer delegated outside of Integrator
+	};
+
+	virtual ~Integrator(){}

      //Initialization of momenta and actions
-      void refresh(GaugeField& U,GridParallelRNG &pRNG){
-	std::cout<<GridLogIntegrator<< "Integrator refresh\n";
-	generate_momenta(P,pRNG);
-	for(int level=0; level< as.size(); ++level){
-	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
-	    as[level].actions.at(actionID)->refresh(U, pRNG);
-	  }
+	void refresh(GaugeField& U,GridParallelRNG &pRNG){
+		std::cout<<GridLogIntegrator<< "Integrator refresh\n";
+		generate_momenta(P,pRNG);
+		for(int level=0; level< as.size(); ++level){
+			for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
+	    // get gauge field from the SmearingPolicy and
+	    // based on the boolean is_smeared in actionID
+				GaugeField& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+				as[level].actions.at(actionID)->refresh(Us, pRNG);
+			}
+		}
 	}
-      }

      // Calculate action
-      RealD S(GaugeField& U){
+	RealD S(GaugeField& U){// here also U not used

-	LatticeComplex Hloc(U._grid);	Hloc = zero;
+		LatticeComplex Hloc(U._grid);	Hloc = zero;
 	// Momenta
-	for (int mu=0; mu <Nd; mu++){
-	  auto Pmu = PeekIndex<LorentzIndex>(P, mu);
-	  Hloc -= trace(Pmu*Pmu);
-	}
-	Complex Hsum = sum(Hloc);
-	
-	RealD H = Hsum.real();
-	RealD Hterm;
-	std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";
+		for (int mu=0; mu <Nd; mu++){
+			auto Pmu = PeekIndex<LorentzIndex>(P, mu);
+			Hloc -= trace(Pmu*Pmu);
+		}
+		Complex Hsum = sum(Hloc);
+
+		RealD H = Hsum.real();
+		RealD Hterm;
+		std::cout<<GridLogMessage << "Momentum action H_p = "<< H << "\n";

 	// Actions
-	for(int level=0; level<as.size(); ++level){
-	  for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
-	    Hterm = as[level].actions.at(actionID)->S(U);
-	    std::cout<<GridLogMessage << "Level "<<level<<" term "<<actionID<<" H = "<<Hterm<<std::endl;
-	    H += Hterm;
-	  }
-	}
-	
-	return H;
-      }
+		for(int level=0; level<as.size(); ++level){
+			for(int actionID=0; actionID<as[level].actions.size(); ++actionID){
+	    // get gauge field from the SmearingPolicy and
+	    // based on the boolean is_smeared in actionID
+				GaugeField& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+				Hterm = as[level].actions.at(actionID)->S(Us);
+				std::cout<<GridLogMessage << "S Level "<<level<<" term "<<actionID<<" H = "<<Hterm<<std::endl;
+				H += Hterm;
+			}
+		}

-      void integrate(GaugeField& U){
+		return H;
+	}
+
+	void integrate(GaugeField& U){

 	// reset the clocks
-	t_U=0;
-	for(int level=0; level<as.size(); ++level){
-	  t_P[level]=0;
-	}	
+		t_U=0;
+		for(int level=0; level<as.size(); ++level){
+			t_P[level]=0;
+		}	

 	for(int step=0; step< Params.MDsteps; ++step){   // MD step
-	  int first_step = (step==0);
-	  int  last_step = (step==Params.MDsteps-1);
-	  this->step(U,0,first_step,last_step);
+		int first_step = (step==0);
+		int  last_step = (step==Params.MDsteps-1);
+		this->step(U,0,first_step,last_step);
 	}

 	// Check the clocks all match on all levels
@ -219,9 +239,9 @@ namespace Grid{
 	assert(fabs(t_U-Params.trajL) < 1.0e-6);


-      }
-    };
-    
-  }
+}
+};
+
+}
 }
 #endif//INTEGRATOR_INCLUDED
--- a/lib/qcd/hmc/integrators/Integrator_algorithm.h
+++ b/lib/qcd/hmc/integrators/Integrator_algorithm.h
@ -91,14 +91,17 @@ namespace Grid{
    *  P 1/2                            P 1/2
    */    

-    template<class GaugeField> class LeapFrog : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class LeapFrog :
+      public Integrator<GaugeField, SmearingPolicy> {
    public:

-      typedef LeapFrog<GaugeField> Algorithm;
+      typedef LeapFrog<GaugeField, SmearingPolicy> Algorithm;

      LeapFrog(GridBase* grid, 
 	       IntegratorParameters Par,
-	       ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+	       ActionSet<GaugeField> & Aset,
+	       SmearingPolicy & Sm):
+	Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset,Sm) {};


      void step (GaugeField& U, int level,int _first, int _last){
@ -135,7 +138,8 @@ namespace Grid{
      }
    };

-    template<class GaugeField> class MinimumNorm2 : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class MinimumNorm2 :
+      public Integrator<GaugeField, SmearingPolicy> {
    private:
      const RealD lambda = 0.1931833275037836;

@ -143,7 +147,9 @@ namespace Grid{

      MinimumNorm2(GridBase* grid, 
 		   IntegratorParameters Par,
-		   ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+		   ActionSet<GaugeField> & Aset,
+		   SmearingPolicy& Sm):
+	Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset,Sm) {};

      void step (GaugeField& U, int level, int _first,int _last){

@ -191,7 +197,8 @@ namespace Grid{
    };


-    template<class GaugeField> class ForceGradient : public Integrator<GaugeField> {
+    template<class GaugeField, class SmearingPolicy> class ForceGradient :
+      public Integrator<GaugeField, SmearingPolicy> {
    private:
      const RealD lambda = 1.0/6.0;;
      const RealD chi    = 1.0/72.0;
@ -202,7 +209,9 @@ namespace Grid{
      // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
    ForceGradient(GridBase* grid, 
 		  IntegratorParameters Par,
-		  ActionSet<GaugeField> & Aset): Integrator<GaugeField>(grid,Par,Aset) {};
+		  ActionSet<GaugeField> & Aset,
+		  SmearingPolicy &Sm):
+      Integrator<GaugeField, SmearingPolicy>(grid,Par,Aset, Sm) {};


      void FG_update_P(GaugeField&U, int level,double fg_dt,double ep){
--- a/lib/qcd/smearing/APEsmearing.h
+++ b/lib/qcd/smearing/APEsmearing.h
@ -0,0 +1,130 @@
+/*!
+  @brief Declaration of Smear_APE class for APE smearing
+*/
+
+#ifndef APE_SMEAR_
+#define APE_SMEAR_
+
+  namespace Grid {
+  	namespace QCD {
+
+
+    /*!  @brief APE type smearing of link variables. */
+    template <class Gimpl>
+    class Smear_APE : public Smear<Gimpl> {
+     private:
+      const std::vector<double> rho; /*!< Array of weights, read as rho[mu + Nd * nu] */
+
+      // This member must be private - we do not want to control it from outside.
+      // Builds an Nd*Nd table holding common_rho everywhere except on the
+      // diagonal entries (mu == nu), which are set to zero.
+      std::vector<double> set_rho(const double common_rho) const {
+        std::vector<double> weights(Nd * Nd, common_rho);
+        for (int mu = 0; mu < Nd; ++mu) {
+          weights[mu + mu * Nd] = 0.0;
+        }
+        return weights;
+      }
+
+     public:
+      // Defines the gauge field types
+      INHERIT_GIMPL_TYPES(Gimpl)
+
+      // Constructors and destructors
+      // NOTE: the size of rho_ is not checked here, while smear() and
+      // derivative() read entries up to index Nd*Nd - 1.
+      Smear_APE(const std::vector<double>& rho_) : rho(rho_) {}
+      Smear_APE(double rho_val) : rho(set_rho(rho_val)) {}
+      Smear_APE() : rho(set_rho(1.0)) {}
+      ~Smear_APE() {}
+
+      ///////////////////////////////////////////////////////////////////////////////
+      // For every direction mu, sums the rho-weighted staples over nu != mu
+      // and stores the adjoint of that sum in component mu of u_smr.
+      void smear(GaugeField& u_smr, const GaugeField& U) const {
+        GridBase *grid = U._grid;
+        GaugeLinkField Cup(grid), tmp_stpl(grid);
+        WilsonLoops<Gimpl> WL;
+        u_smr = zero;
+
+        for (int mu = 0; mu < Nd; ++mu) {
+          Cup = zero;
+          for (int nu = 0; nu < Nd; ++nu) {
+            if (nu == mu) continue;
+            // get the staple in direction mu, nu
+            WL.Staple(tmp_stpl, U, mu, nu);  // nb staple conventions of IroIro and Grid differ by a dagger
+            Cup += tmp_stpl*rho[mu + Nd * nu];
+          }
+          // save the Cup link-field on the u_smr gauge-field
+          pokeLorentz(u_smr, adj(Cup), mu);  // u_smr[mu] = Cup^dag   see conventions for Staple
+        }
+      }
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Reference
+      // Morningstar, Peardon, Phys.Rev.D69,054501(2004)
+      // Equation 75
+      // Computing Sigma_mu, derivative of S[fat links] with respect to the thin links
+      // Output SigmaTerm: every contribution is passed to Gimpl::AddGaugeLink;
+      // this method does not reset SigmaTerm beforehand.
+      void derivative(GaugeField& SigmaTerm,
+                      const GaugeField& iLambda,
+                      const GaugeField& U) const {
+        GridBase *grid = U._grid;
+
+        WilsonLoops<Gimpl> WL;
+        GaugeLinkField staple(grid), u_tmp(grid);
+        GaugeLinkField iLambda_mu(grid), iLambda_nu(grid);
+        GaugeLinkField U_mu(grid), U_nu(grid);
+        GaugeLinkField sh_field(grid), temp_Sigma(grid);
+        Real rho_munu, rho_numu;
+
+        for (int mu = 0; mu < Nd; ++mu) {
+          U_mu       = peekLorentz(      U, mu);
+          iLambda_mu = peekLorentz(iLambda, mu);
+
+          for (int nu = 0; nu < Nd; ++nu) {
+            if (nu == mu) continue;
+            U_nu       = peekLorentz(      U, nu);
+            iLambda_nu = peekLorentz(iLambda, nu);
+
+            rho_munu = rho[mu + Nd * nu];
+            rho_numu = rho[nu + Nd * mu];
+
+            WL.StapleUpper(staple, U, mu, nu);
+
+            temp_Sigma = -rho_numu*staple*iLambda_nu;  //ok
+            //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x)
+            Gimpl::AddGaugeLink(SigmaTerm, temp_Sigma, mu);
+
+            sh_field = Cshift(iLambda_nu, mu, 1);  // general also for Gparity?
+
+            temp_Sigma = rho_numu*sh_field*staple;  //ok
+            //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)
+            Gimpl::AddGaugeLink(SigmaTerm, temp_Sigma, mu);
+
+            sh_field = Cshift(iLambda_mu, nu, 1);
+
+            temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu);  //ok
+            //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x)
+            Gimpl::AddGaugeLink(SigmaTerm, temp_Sigma, mu);
+
+            staple = zero;
+            sh_field = Cshift(U_nu, mu, 1);
+
+            temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu;
+            temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu;
+
+            u_tmp = adj(U_nu)*iLambda_nu;
+            sh_field = Cshift(u_tmp, mu, 1);
+            temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu;
+            sh_field = Cshift(temp_Sigma, nu, -1);
+            Gimpl::AddGaugeLink(SigmaTerm, sh_field, mu);
+          }
+        }
+      }
+    };
+
+
+
+  }// namespace QCD
+}//namespace Grid
+#endif  
--- a/lib/qcd/smearing/BaseSmearing.h
+++ b/lib/qcd/smearing/BaseSmearing.h
@ -0,0 +1,17 @@
+/*
+  @brief Declares base smearing class Smear
+ */
+#ifndef BASE_SMEAR_
+#define BASE_SMEAR_
+
+/*! @brief Abstract interface implemented by the gauge-link smearing classes. */
+template <class Gimpl>
+class Smear {
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl)  // inherits the types for the gauge fields
+
+  virtual ~Smear() {}
+
+  // Writes the smeared field into the first argument, computed from the
+  // input field given as second argument.
+  virtual void smear(GaugeField& u_smr, const GaugeField& U) const = 0;
+
+  // Writes into the first argument; the two const fields are inputs.
+  // Parameter names follow the Smear_APE implementation.
+  virtual void derivative(GaugeField& SigmaTerm,
+                          const GaugeField& iLambda,
+                          const GaugeField& U) const = 0;
+};
+#endif
--- a/lib/qcd/smearing/GaugeConfiguration.h
+++ b/lib/qcd/smearing/GaugeConfiguration.h
@ -0,0 +1,262 @@
+/*!
+  @file GaugeConfiguration.h
+
+  @brief Declares the GaugeConfiguration class
+*/
+#ifndef GAUGE_CONFIG_
+#define GAUGE_CONFIG_
+
+namespace Grid {
+
+namespace QCD {
+
+/*!
+  @brief Smeared configuration container
+
+  It will behave like a configuration from the point of view of
+  the HMC update and integrators.
+  An "advanced configuration" object that can provide not only the
+  data to store the gauge configuration but also operations to manipulate
+  it, like smearing.
+
+  It stores a list of smeared configurations.
+*/
+template <class Gimpl>
+class SmearedConfiguration {
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+ private:
+  const unsigned int smearingLevels;
+  Smear_Stout<Gimpl> StoutSmearing;
+  std::vector<GaugeField> SmearedSet;
+
+  // Member functions
+  //====================================================================
+  /*!
+    @brief Attaches the container to the thin-link field U and rebuilds every
+    smeared level from it.
+
+    Level 0 is obtained by smearing U; every further level is obtained by
+    smearing the previous level. With zero smearing levels only the
+    ThinLinks pointer is updated.
+
+    @param U thin-link configuration. Its address is stored in ThinLinks and
+             dereferenced by later calls, so U must stay alive while this
+             object is in use.
+  */
+  void fill_smearedSet(GaugeField& U) {
+    ThinLinks = &U;  // attach the smearing routine to the field U
+
+    // check the pointer is not null; stop here instead of dereferencing it
+    if (ThinLinks == NULL) {
+      std::cout << GridLogError
+                << "[SmearedConfiguration] Error in ThinLinks pointer\n";
+      return;
+    }
+
+    if (smearingLevels > 0) {
+      std::cout << GridLogDebug
+                << "[SmearedConfiguration] Filling SmearedSet\n";
+      GaugeField previous_u(ThinLinks->_grid);
+
+      previous_u = *ThinLinks;
+      // unsigned index: smearingLevels is unsigned, avoid a mixed-sign compare
+      for (unsigned int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
+        StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
+        previous_u = SmearedSet[smearLvl];
+
+        // For debug purposes
+        RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(previous_u);
+        std::cout << GridLogDebug
+                  << "[SmearedConfiguration] Plaq: " << impl_plaq << std::endl;
+      }
+    }
+  }
+  //====================================================================
+  /*!
+    @brief Maps the force SigmaKPrime through one smearing step built on the
+    configuration GaugeK.
+
+    For every direction it forms iQ = Ta(C_mu * adj(U_mu)), obtains exp(iQ)
+    and iLambda_mu from set_iLambda, and finally adds the derivative of the
+    base smearing.
+
+    @param SigmaKPrime force with respect to the smeared links
+    @param GaugeK      configuration the smearing step was applied to
+    @return the accumulated force field
+  */
+  GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
+                                  const GaugeField& GaugeK) const {
+    GridBase* lattice = GaugeK._grid;
+
+    // Lorentz-vector fields
+    GaugeField baseSmeared(lattice);
+    GaugeField result(lattice);
+    GaugeField iLambdaAll(lattice);
+
+    // single-direction work fields
+    GaugeLinkField iLambda_dir(lattice);
+    GaugeLinkField iQ_dir(lattice), expiQ_dir(lattice);
+    GaugeLinkField SigmaP_dir(lattice);
+    GaugeLinkField U_dir(lattice), C_dir(lattice);
+
+    StoutSmearing.BaseSmear(baseSmeared, GaugeK);
+    result = zero;
+    iLambdaAll = zero;
+
+    for (int dir = 0; dir != Nd; ++dir) {
+      C_dir = peekLorentz(baseSmeared, dir);
+      U_dir = peekLorentz(GaugeK, dir);
+      SigmaP_dir = peekLorentz(SigmaKPrime, dir);
+      iQ_dir = Ta(C_dir * adj(U_dir));
+      set_iLambda(iLambda_dir, expiQ_dir, iQ_dir, SigmaP_dir, U_dir);
+      pokeLorentz(result, SigmaP_dir * expiQ_dir + adj(C_dir) * iLambda_dir,
+                  dir);
+      pokeLorentz(iLambdaAll, iLambda_dir, dir);
+    }
+
+    // derivative of SmearBase
+    StoutSmearing.derivative(result, iLambdaAll, GaugeK);
+    return result;
+  }
+
+  /*! @brief Read-only access to the entry of SmearedSet at index 'Level'
+      (no bounds check is performed). */
+  const GaugeField& get_smeared_conf(int Level) const {
+    const GaugeField& conf = SmearedSet[Level];
+    return conf;
+  }
+
+  //====================================================================
+  /*!
+    @brief Computes exp(iQ) and the field iLambda for one link direction.
+
+    @param[out] iLambda Ta(iGamma), where iGamma is assembled at the end of
+                        the function from iQ, f1, f2 and the traces of
+                        GaugeK*Sigmap with B1 and B2
+    @param[out] e_iQ    f0 + (-i f1) iQ - f2 iQ^2
+    @param[in]  iQ      input matrix field (AnalyticSmearedForce passes
+                        Ta(C_mu * adj(U_mu)))
+    @param[in]  Sigmap  force component for this direction
+    @param[in]  GaugeK  link field for this direction
+
+    No guard is applied where the denominator (9u^2 - w^2) or w vanishes.
+  */
+  void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
+                   const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
+                   const GaugeLinkField& GaugeK) const {
+    GridBase* grid = iQ._grid;
+    GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
+    GaugeLinkField unity(grid);
+    unity = 1.0;
+
+    // the unused work fields (tmp, qt) were dropped: each one allocated a
+    // full lattice field per call
+    LatticeComplex u(grid), w(grid);
+    LatticeComplex f0(grid), f1(grid), f2(grid);
+    LatticeComplex xi0(grid), xi1(grid);
+    LatticeComplex u2(grid), w2(grid), cosw(grid);
+    LatticeComplex emiu(grid), e2iu(grid), fden(grid);
+    LatticeComplex r01(grid), r11(grid), r21(grid), r02(grid), r12(grid);
+    LatticeComplex r22(grid), tr1(grid), tr2(grid);
+    LatticeComplex b10(grid), b11(grid), b12(grid), b20(grid), b21(grid),
+        b22(grid);
+    LatticeComplex LatticeUnitComplex(grid);
+
+    LatticeUnitComplex = 1.0;
+
+    // Exponential
+    iQ2 = iQ * iQ;
+    iQ3 = iQ * iQ2;
+    StoutSmearing.set_uw(u, w, iQ2, iQ3);
+    StoutSmearing.set_fj(f0, f1, f2, u, w);
+    e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
+
+    // Getting B1, B2, Gamma and Lambda
+    // TODO: simplify this part, redundant calculations with set_fj
+    xi0 = StoutSmearing.func_xi0(w);
+    xi1 = StoutSmearing.func_xi1(w);
+    u2 = u * u;
+    w2 = w * w;
+    cosw = cos(w);
+
+    emiu = cos(u) - timesI(sin(u));                // exp(-i u)
+    e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));    // exp(2 i u)
+
+    r01 = (2.0 * u + timesI(2.0 * (u2 - w2))) * e2iu +
+          emiu * ((16.0 * u * cosw + 2.0 * u * (3.0 * u2 + w2) * xi0) +
+                  timesI(-8.0 * u2 * cosw + 2.0 * (9.0 * u2 + w2) * xi0));
+
+    r11 = (2.0 * LatticeUnitComplex + timesI(4.0 * u)) * e2iu +
+          emiu * ((-2.0 * cosw + (3.0 * u2 - w2) * xi0) +
+                  timesI((2.0 * u * cosw + 6.0 * u * xi0)));
+
+    r21 =
+        2.0 * timesI(e2iu) + emiu * (-3.0 * u * xi0 + timesI(cosw - 3.0 * xi0));
+
+    r02 = -2.0 * e2iu +
+          emiu * (-8.0 * u2 * xi0 +
+                  timesI(2.0 * u * (cosw + xi0 + 3.0 * u2 * xi1)));
+
+    r12 = emiu * (2.0 * u * xi0 + timesI(-cosw - xi0 + 3.0 * u2 * xi1));
+
+    r22 = emiu * (xi0 - timesI(3.0 * u * xi1));
+
+    // common denominator 1 / (2 (9u^2 - w^2)^2) of the b coefficients
+    fden = LatticeUnitComplex / (2.0 * (9.0 * u2 - w2) * (9.0 * u2 - w2));
+
+    b10 = 2.0 * u * r01 + (3.0 * u2 - w2) * r02 - (30.0 * u2 + 2.0 * w2) * f0;
+    b11 = 2.0 * u * r11 + (3.0 * u2 - w2) * r12 - (30.0 * u2 + 2.0 * w2) * f1;
+    b12 = 2.0 * u * r21 + (3.0 * u2 - w2) * r22 - (30.0 * u2 + 2.0 * w2) * f2;
+
+    b20 = r01 - (3.0 * u) * r02 - (24.0 * u) * f0;
+    b21 = r11 - (3.0 * u) * r12 - (24.0 * u) * f1;
+    b22 = r21 - (3.0 * u) * r22 - (24.0 * u) * f2;
+
+    b10 *= fden;
+    b11 *= fden;
+    b12 *= fden;
+    b20 *= fden;
+    b21 *= fden;
+    b22 *= fden;
+
+    // B1, B2 use the same polynomial form in iQ as e_iQ above
+    B1 = b10 * unity + timesMinusI(b11) * iQ - b12 * iQ2;
+    B2 = b20 * unity + timesMinusI(b21) * iQ - b22 * iQ2;
+    USigmap = GaugeK * Sigmap;
+
+    tr1 = trace(USigmap * B1);
+    tr2 = trace(USigmap * B2);
+
+    GaugeLinkField QUS = iQ * USigmap;
+    GaugeLinkField USQ = USigmap * iQ;
+
+    GaugeLinkField iGamma = tr1 * iQ - timesI(tr2) * iQ2 +
+                            timesI(f1) * USigmap + f2 * QUS + f2 * USQ;
+
+    iLambda = Ta(iGamma);
+  }
+
+  //====================================================================
+ public:
+  /*! @brief Pointer to the thin links configuration (not owned; set by
+      fill_smearedSet to the address of the field passed in). */
+  GaugeField*
+      ThinLinks;
+
+  /*!
+    @brief Standard constructor
+
+    Creates Nsmear smeared-level fields on UGrid and keeps a copy of Stout.
+    ThinLinks starts as NULL.
+
+    @param UGrid  grid on which the smeared fields are allocated
+    @param Nsmear number of smearing levels
+    @param Stout  stout smearing object, copied into this container
+  */
+  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
+                       Smear_Stout<Gimpl>& Stout)
+      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
+    SmearedSet.reserve(smearingLevels);
+    // Push a temporary: `*(new GaugeField(UGrid))` copied the heap object
+    // into the vector and never released it, leaking one field per level.
+    for (unsigned int i = 0; i < smearingLevels; ++i)
+      SmearedSet.push_back(GaugeField(UGrid));
+  }
+
+  /*! Thin-links-only constructor: zero smearing levels, an empty
+      SmearedSet, a default-constructed StoutSmearing and no attached
+      field. */
+  SmearedConfiguration() : smearingLevels(0), ThinLinks(NULL) {}
+
+  /*! Attaches the smearing routines to the thin links U and fills the
+      smeared set from them. */
+  void set_GaugeField(GaugeField& U) {
+    fill_smearedSet(U);
+  }
+
+  //====================================================================
+  /*!
+    @brief Converts, in place, the force computed with the smeared links
+    into the force with respect to the thin links.
+
+    The input is first multiplied from the left by the adjoint of the last
+    smeared level, then passed through AnalyticSmearedForce once per level
+    down to the thin links, and finally multiplied by the thin links.
+    With zero smearing levels SigmaTilde is left untouched.
+  */
+  void smeared_force(GaugeField& SigmaTilde) const {
+    // if smearingLevels = 0 do nothing
+    if (smearingLevels == 0) return;
+
+    GaugeField force = SigmaTilde;  // actually = U*SigmaTilde
+    GaugeLinkField link_dir(SigmaTilde._grid);
+    const GaugeField& lastLevel = SmearedSet[smearingLevels - 1];
+
+    // to get just SigmaTilde
+    for (int dir = 0; dir != Nd; ++dir) {
+      link_dir = adj(peekLorentz(lastLevel, dir)) * peekLorentz(force, dir);
+      pokeLorentz(force, link_dir, dir);
+    }
+
+    // walk down the smeared levels ...
+    int level = smearingLevels - 1;
+    while (level > 0) {
+      force = AnalyticSmearedForce(force, get_smeared_conf(level - 1));
+      --level;
+    }
+    // ... and take the last step on the thin links
+    force = AnalyticSmearedForce(force, *ThinLinks);
+
+    for (int dir = 0; dir != Nd; ++dir) {
+      link_dir = peekLorentz(*ThinLinks, dir) * peekLorentz(force, dir);
+      pokeLorentz(SigmaTilde, link_dir, dir);
+    }
+  }
+  //====================================================================
+
+  /*! Returns the last (most smeared) entry of SmearedSet; must not be
+      called when there are no smearing levels. */
+  GaugeField& get_SmearedU() {
+    return SmearedSet.back();
+  }
+
+  /*!
+    @brief Returns the configuration, thin links by default.
+
+    The most smeared level is returned only when `smeared` is true and at
+    least one smearing level exists; in every other case the thin links are
+    returned. The average plaquette of the returned field is logged at debug
+    level.
+  */
+  GaugeField& get_U(bool smeared = false) {
+    const bool use_smeared = smeared && (smearingLevels != 0);
+
+    if (use_smeared) {
+      RealD smr_plaq =
+          WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
+      std::cout << GridLogDebug << "getting Usmr Plaq: " << smr_plaq
+                << std::endl;
+      return get_SmearedU();
+    }
+
+    // thin links: either requested explicitly or no smeared level available
+    RealD thin_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
+    std::cout << GridLogDebug << "getting Thin Plaq: " << thin_plaq
+              << std::endl;
+    return *ThinLinks;
+  }
+};
+}
+}
+
+#endif
--- a/lib/qcd/smearing/Smearing.h
+++ b/lib/qcd/smearing/Smearing.h
@ -0,0 +1,9 @@
+#ifndef GRID_QCD_SMEARING_H
+#define GRID_QCD_SMEARING_H
+
+#include <Grid/qcd/smearing/BaseSmearing.h>
+#include <Grid/qcd/smearing/APEsmearing.h>
+#include <Grid/qcd/smearing/StoutSmearing.h>
+#include <Grid/qcd/smearing/GaugeConfiguration.h>
+
+#endif
--- a/lib/qcd/smearing/StoutSmearing.h
+++ b/lib/qcd/smearing/StoutSmearing.h
@ -0,0 +1,160 @@
+/*
+  @file StoutSmearing.h
+  @brief Declares Stout smearing class
+*/
+#ifndef STOUT_SMEAR_
+#define STOUT_SMEAR_
+
+namespace Grid {
+namespace QCD {
+
+/*!  @brief Stout smearing of link variable. */
+template <class Gimpl>
+class Smear_Stout : public Smear<Gimpl> {
+ private:
+  const Smear<Gimpl>* SmearBase;
+
+ public:
+  INHERIT_GIMPL_TYPES(Gimpl)
+
+  /*! Builds the stout smearing on top of a caller-supplied base smearing.
+      Only the pointer is stored; the base object is not copied. */
+  Smear_Stout(Smear<Gimpl>* base) : SmearBase(base) {
+    // the closed-form exponential used below is written for 3x3 matrices
+    static_assert(Nc == 3,
+                  "Stout smearing currently implemented only for Nc==3");
+  }
+
+  /*! Default constructor: allocates a Smear_APE base smearing with
+      parameter rho on the heap. */
+  Smear_Stout(double rho = 1.0) : SmearBase(new Smear_APE<Gimpl>(rho)) {
+    static_assert(Nc == 3,
+                  "Stout smearing currently implemented only for Nc==3");
+  }
+
+  // The destructor does not delete SmearBase, so the object allocated by the
+  // default constructor is never released by this class.
+  // NOTE(review): a plain `delete SmearBase` here would be unsafe as the class
+  // stands: copies of Smear_Stout share the same pointer, and the pointer may
+  // have been supplied by the caller through the other constructor.
+  ~Smear_Stout() {}  // delete SmearBase...
+
+  /*!
+    @brief Stout-smears the configuration U into u_smr.
+
+    The base smearing provides the field C; for each direction the link is
+    replaced by exp(iQ_mu) * U_mu with iQ_mu = Ta(C_mu * adj(U_mu)).
+  */
+  void smear(GaugeField& u_smr, const GaugeField& U) const {
+    GridBase* lattice = U._grid;
+    GaugeField baseSmeared(lattice);
+    GaugeLinkField work(lattice), iQ_dir(lattice), U_dir(lattice);
+
+    std::cout << GridLogDebug << "Stout smearing started\n";
+
+    // Smear the configurations
+    SmearBase->smear(baseSmeared, U);
+
+    for (int dir = 0; dir != Nd; ++dir) {
+      work = peekLorentz(baseSmeared, dir);
+      U_dir = peekLorentz(U, dir);
+      // iQ_dir = Ta(Omega_mu), sign chosen to match the paper
+      iQ_dir = Ta(work * adj(U_dir));
+      exponentiate_iQ(work, iQ_dir);
+      // u_smr = exp(iQ_mu)*U_mu
+      pokeLorentz(u_smr, work * U_dir, dir);
+    }
+
+    std::cout << GridLogDebug << "Stout smearing completed\n";
+  }
+
+  /*! Forwards the derivative computation unchanged to the base smearing. */
+  void derivative(GaugeField& SigmaTerm, const GaugeField& iLambda,
+                  const GaugeField& Gauge) const {
+    const Smear<Gimpl>& base = *SmearBase;
+    base.derivative(SigmaTerm, iLambda, Gauge);
+  }
+
+  /*! Applies only the base smearing to U, storing the result in C. */
+  void BaseSmear(GaugeField& C, const GaugeField& U) const {
+    const Smear<Gimpl>& base = *SmearBase;
+    base.smear(C, U);
+  }
+
+  /*!
+    @brief Exponential of the input matrix field, one Lorentz direction at a
+    time, via e_iQ = f0 + (-i f1) iQ - f2 iQ^2.
+
+    Only valid for SU(3) matrices. Note that this computes
+    exp(input matrix): the factor i comes from outside, so the input is
+    anti-hermitian, NOT hermitian.
+
+    TODO: move this routine out of the class.
+  */
+  void exponentiate_iQ(GaugeLinkField& e_iQ, const GaugeLinkField& iQ) const {
+    GridBase* lattice = iQ._grid;
+
+    // powers of the input matrix
+    GaugeLinkField iQsq(lattice), iQcube(lattice);
+    iQsq = iQ * iQ;
+    iQcube = iQ * iQsq;
+
+    // scalar coefficients of the polynomial
+    LatticeComplex u(lattice), w(lattice);
+    set_uw(u, w, iQsq, iQcube);
+
+    LatticeComplex f0(lattice), f1(lattice), f2(lattice);
+    set_fj(f0, f1, f2, u, w);
+
+    GaugeLinkField identity(lattice);
+    identity = 1.0;
+
+    e_iQ = f0 * identity + timesMinusI(f1) * iQ - f2 * iQsq;
+  }
+
+  /*!
+    @brief Computes the fields u and w from the traces of iQ2 and iQ3.
+
+    With c0 = -Im tr(iQ3)/3, c1 = -Re tr(iQ2)/2, c0max = 2 (c1/3)^(3/2) and
+    theta = acos(c0/c0max)/3:
+      u = sqrt(c1/3) cos(theta),  w = sqrt(c1) sin(theta).
+    No guard is applied for c0max == 0.
+  */
+  void set_uw(LatticeComplex& u, LatticeComplex& w, GaugeLinkField& iQ2,
+              GaugeLinkField& iQ3) const {
+    GridBase* lattice = u._grid;
+    LatticeComplex c0(lattice), c1(lattice);
+    LatticeComplex c1_third(lattice), c0max(lattice), angle(lattice);
+
+    Complex third = 1.0 / 3.0;
+    Complex half = 1.0 / 2.0;
+
+    // sign in c0 from the conventions on the Ta
+    c0 = -imag(trace(iQ3)) * third;
+    c1 = -real(trace(iQ2)) * half;
+
+    // Cayley Hamilton checks to machine precision, tested
+    c1_third = c1 * third;
+    c0max = 2.0 * pow(c1_third, 1.5);
+
+    // the division by three is applied here, once
+    angle = acos(c0 / c0max) * third;
+    u = sqrt(c1_third) * cos(angle);
+    w = sqrt(c1) * sin(angle);
+  }
+
+  /*!
+    @brief Computes the coefficients f0, f1, f2 from u and w.
+
+    f_j = h_j / (9u^2 - w^2), with h_j built from exp(2iu), exp(-iu),
+    cos(w) and xi0(w) = sin(w)/w. No guard is applied where the denominator
+    or w vanishes.
+
+    @param[out] f0,f1,f2 resulting coefficient fields
+    @param[in]  u,w      fields produced by set_uw
+  */
+  void set_fj(LatticeComplex& f0, LatticeComplex& f1, LatticeComplex& f2,
+              const LatticeComplex& u, const LatticeComplex& w) const {
+    GridBase* grid = u._grid;
+    LatticeComplex xi0(grid), u2(grid), w2(grid), cosw(grid);
+    LatticeComplex fden(grid);
+    LatticeComplex h0(grid), h1(grid), h2(grid);
+    // the unused work field qt was dropped (one lattice allocation per call)
+    LatticeComplex e2iu(grid), emiu(grid), ixi0(grid);
+    LatticeComplex unity(grid);
+    unity = 1.0;
+
+    xi0 = func_xi0(w);
+    u2 = u * u;
+    w2 = w * w;
+    cosw = cos(w);
+
+    ixi0 = timesI(xi0);
+    emiu = cos(u) - timesI(sin(u));              // exp(-i u)
+    e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));  // exp(2 i u)
+
+    h0 = e2iu * (u2 - w2) +
+         emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0));
+    h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0);
+    h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0);
+
+    fden = unity / (9.0 * u2 - w2);  // reals
+    f0 = h0 * fden;
+    f1 = h1 * fden;
+    f2 = h2 * fden;
+  }
+
+  /*! Returns xi0(w) = sin(w) / w.
+      TODO: no check is made for small w (a warning for w < 1e-4 was
+      planned). */
+  LatticeComplex func_xi0(const LatticeComplex& w) const {
+    LatticeComplex xi0 = sin(w) / w;
+    return xi0;
+  }
+
+  /*! Returns xi1(w) = cos(w)/w^2 - sin(w)/w^3.
+      TODO: no check is made for small w (a warning for w < 1e-4 was
+      planned). */
+  LatticeComplex func_xi1(const LatticeComplex& w) const {
+    LatticeComplex xi1 = cos(w) / (w * w) - sin(w) / (w * w * w);
+    return xi1;
+  }
+};
+}
+}
+
+#endif
--- a/Show More
+++ b/Show More