mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 13:57:07 +01:00

Compare commits


201 Commits

Author SHA1 Message Date
d68937654b Merge pull request #50 from waterret/develop
use sha256 based splittable rng
2016-10-20 16:40:31 +01:00
7af9b87318 Cache face tables to improve performance.
Extract merge now looking poor.
2016-10-18 09:51:37 +01:00
70f386f9c6 switch to use Output Feedback Split mode
https://github.com/waterret/RngState-OFS
2016-10-17 14:20:59 -04:00
89cda5971a update rng number algo 2016-10-17 13:31:31 -04:00
c39ec3b607 update license and if guards 2016-10-17 13:31:31 -04:00
8afcc8fb8b fix state size 2016-10-17 13:31:31 -04:00
1abbe2fd0c update rng-state, change output format 2016-10-17 13:31:31 -04:00
4fb37ececd fix sprng-sha256 seed with seq 2016-10-17 13:31:30 -04:00
71eaa7c79e use sha256 based splittable rng 2016-10-17 13:31:30 -04:00
811ca45473 GNU/Clang hack for AVX512, since reduce intrinsics are missing in the Clang 3.9 and GCC 6 AVX512 support 2016-10-17 16:23:21 +01:00
bc1a4d40ba Faster integer handling, avoiding push_back 2016-10-17 16:16:44 +01:00
c8079e6621 Time the face gather in x-dir more carefully 2016-10-13 22:28:50 +01:00
8b0d171c9a 32bit issue on the KNL code variant where byte offsets were stored 2016-10-12 17:49:32 +01:00
1f293b76b4 Merge branch 'feature/knl-stats' into develop 2016-10-12 13:47:58 +01:00
8bbd9ebc27 Reversing changes to Stencil class 2016-10-12 13:47:20 +01:00
6472b431f0 __rdpmc needed for gcc, clang++ 2016-10-12 12:29:08 +01:00
bd205a3293 Fixing for non x86 and non KNL 2016-10-12 12:09:15 +01:00
496beffa88 Fix non-KNL build 2016-10-12 12:06:08 +01:00
9b63e97108 align not absolutely required and confuses clang++ 2016-10-12 11:51:21 +01:00
81f2aeaece KNL streaming stores, and KNL performance counters 2016-10-12 11:45:22 +01:00
2d4a45c758 Typecast pointer 2016-10-12 09:14:15 +01:00
0f182f033b Drop macos with gcc 2016-10-11 22:29:06 +01:00
7240d73184 Parallelise the x faces; fix the segv on KNL with comms 2016-10-11 22:21:07 +01:00
42cd148f5e Base pointer for comms buffer under AVX512 assembly 2016-10-11 16:06:06 +01:00
611b5d74ba Fix for AVX+FMA3 compilation 2016-10-10 15:26:17 +01:00
b56c9ffa52 Fix for AVXFMA 2016-10-10 14:43:37 +01:00
70c32fa49b Merge branch 'develop' of github.com:paboyle/Grid into develop 2016-10-09 12:55:46 +01:00
77c8a94dae AVXFMA4 flag fix for Intel Compiler 2016-10-09 12:55:12 +01:00
2e453dfbf5 Added some instrumentation to benchmark the force computation 2016-10-06 17:52:45 +01:00
4089984431 Timing hooks 2016-10-06 09:25:12 +01:00
98439847cf configure portability fix 2016-10-05 14:57:20 +01:00
c78bbd0f8c Fix ASM compilation 2016-10-04 15:37:32 +01:00
7ea4b959a4 hopefully more portable configure output 2016-09-27 11:54:37 +01:00
536e2ff073 *.inc removed: please don't commit these files either! 2016-09-27 11:54:03 +01:00
798ff34d7e configure removed: please don't commit configure! 2016-09-27 11:29:31 +01:00
04a437c92c Minor modification to the filelist script 2016-09-23 11:12:45 +01:00
5c190a1b8c Merge branch 'develop' into feature/hirep 2016-09-23 11:06:06 +01:00
15d8f5c88c Small change to the configure.ac to include the canonical names 2016-09-23 11:05:36 +01:00
c4ac6e7e8f Consolidating HMC interface
Unified interface for the standard action in the fundamental rep and Hirep
2016-09-23 10:47:42 +01:00
510e340e16 Debugged last commit for the Two index representation 2016-09-22 22:16:21 +01:00
6ffadca153 Restored number of colours to 3 2016-09-22 14:22:54 +01:00
b6597b74e7 Added support for the Two index Symmetric and Antisymmetric representations
Tested for HMC convergence: OK
Added also a test file showing an example for mixed representations
2016-09-22 14:17:37 +01:00
d2573189d8 build system: FFTW fix 2016-09-20 12:30:24 +01:00
65ca174dbb gitignore update 2016-09-20 11:25:06 +01:00
0724f7af75 QPX single precision implementation 2016-09-19 18:09:12 +01:00
2e74520821 removed libtool use (BG/Q compatibility) 2016-09-16 15:25:49 +01:00
6dd75ad9e5 Merge branch 'develop' of github.com:paboyle/Grid into feature/bgq 2016-09-16 15:07:54 +01:00
fda408ee6f Added first lines for supporting Two Index representations 2016-09-13 10:43:30 +01:00
b9c80318a2 Merge branch 'develop' into feature/hirep 2016-09-13 10:01:51 +01:00
5df5d52d41 Fix for the Intel compiler 2016-09-12 17:17:20 +01:00
f76f281e58 Cleaning files after fix 2016-09-09 11:34:25 +01:00
aa20cc8b52 Fixing compilation error with AVX512 flag 2016-09-09 02:58:52 -07:00
0fd179fb33 Merge branch 'develop' into feature/hirep 2016-09-01 12:59:53 +01:00
f45ef8d114 Minor modification in ActionBase.h 2016-09-01 11:46:46 +01:00
fd5614738d Merge branch 'develop' into feature/hirep 2016-08-30 18:21:36 +01:00
005dcc51aa Reset travis 2016-08-30 14:44:10 +01:00
655c893f86 Another test on travis 2016-08-30 14:38:42 +01:00
843f5783b4 Again travis test separating single and double 2016-08-30 14:29:09 +01:00
8986c9fedd Single and double precision travis matrix 2016-08-30 14:25:24 +01:00
c80a1d427c Retest original version of travis yaml 2016-08-30 14:05:05 +01:00
ae57032500 Separate single and double builds in travis 2016-08-30 14:00:34 +01:00
f75468728f Another error on travis 2016-08-30 13:56:23 +01:00
5acd856663 Correction of error in travis 2016-08-30 13:49:49 +01:00
b0d3e4bb2c Separating travis builds 2016-08-30 13:44:07 +01:00
b512ccbee6 HMC for Adjoint fermions works
Accepts and reproduces known results

Check initial instability of inverters
when starting from hot configurations
2016-08-30 11:31:25 +01:00
8c89391c02 FFTW unresolved symbols fixed when no fftw3.h is present 2016-08-24 16:41:47 +01:00
bfac5195b8 tidy up 2016-08-24 16:38:36 +01:00
a782ca3238 Merge branch 'feature/fft-flop-count' into develop 2016-08-24 15:06:17 +01:00
744691097f Printing 2016-08-24 15:05:56 +01:00
ff6da364e8 FFT double and single precision gives good performance now in multithreaded code. 2016-08-24 15:05:00 +01:00
4d11a6f5f2 first commit for QPX intrinsics 2016-08-23 14:41:44 +01:00
88be3b39bb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2016-08-22 18:29:36 +01:00
8a02824e08 Merge branch 'feature/FFT' into develop 2016-08-22 16:25:04 +01:00
356e7940fd fftw can be switched off 2016-08-22 16:24:49 +01:00
73ce476890 Include fftw headers 2016-08-22 16:24:21 +01:00
29c4ef41de Adding a test for libfftw3 2016-08-22 16:21:01 +01:00
e423a09974 FFT improved and test_FFT passing under MPI 8 processes, 8^4 for LatticeComplexD and LatticeSpinMatrixD 2016-08-18 02:23:21 +01:00
17097a93ec FFTW test ran over 4 mpi processes. 2016-08-17 01:33:55 +01:00
94a6373a7f Merge branch 'feature/eigen-cleanup' into develop 2016-08-15 23:58:34 +01:00
4ab7dbfd57 Instantiate 2016-08-15 23:00:40 +01:00
90e70790f3 Feature for z-Mobius prep 2016-08-15 22:31:29 +01:00
9c2e8d5e28 Nc=3 just to let all the tests pass in Travis 2016-08-09 15:46:57 +01:00
147e2025b9 Added unit tests on the representation transformations
Status: Passing all tests
2016-08-08 16:54:22 +01:00
573b8c6020 build system: -O3 is not overridden by env CXXFLAGS 2016-08-06 01:26:24 +01:00
15218ec57f more Travis MPI fix 2016-08-06 00:49:14 +01:00
ec68e08dd2 Travis MPI fix 2016-08-06 00:36:05 +01:00
fc25d2295c fftw download 2016-08-06 00:28:52 +01:00
8dc2cfcedb Adding fftw header pulling 2016-08-06 00:28:28 +01:00
836f93780c first try at including MPI tests in Travis 2016-08-05 13:41:52 +01:00
5a68715be3 Richards sweep test 2016-08-05 10:51:57 +01:00
32bc7a6ab8 MPI back out of change that hangs
AVX2 for clang, gcc needs the -mfma flag.
2016-08-05 10:36:00 +01:00
b65e72e521 Merge pull request #43 from rprollins/bench/output-format
Benchmark_dwf_sweep and Benchmark_zmm output formats
2016-08-04 16:47:01 +01:00
d1aaff65e8 README update 2016-08-04 16:27:02 +01:00
93d29bb699 build system improvements after discussion with Peter 2016-08-04 16:19:59 +01:00
3b376ed54e build system: error if MPI not found 2016-08-03 15:23:38 +01:00
d5c1f614ba gitignore update 2016-08-03 15:14:33 +01:00
2edc24225d untracking ltmain.sh 2016-08-03 15:12:44 +01:00
629283726b build system: local Grid link flag moved to configure.ac 2016-08-03 15:07:42 +01:00
6adb66dd08 build system: finer management of GMP/MPFR dependence 2016-08-03 15:06:45 +01:00
5be92bb708 link fix in README 2016-08-03 12:40:56 +01:00
f4c049ea6d README update 2016-08-03 12:38:54 +01:00
bc092ad30f build system fix 2016-08-03 11:47:38 +01:00
dad642ed1b various build system fixes and improvements 2016-08-03 11:39:20 +01:00
63ae39abc7 proper propagation of OpenMP flags 2016-08-02 17:41:32 +01:00
9e5b934d21 improved LAPACK configuration 2016-08-02 17:26:54 +01:00
a7b483d67a Tests in subdirectories are not built by default 2016-08-02 12:14:28 +01:00
bb99ce0680 bootstrap script fix 2016-08-01 09:51:06 +01:00
83307df1af travis update for new build system 2016-08-01 09:38:40 +01:00
49b5c49851 Checked the hermiticity of the op in derivative, ok
Still CG fails to converge
2016-07-31 12:37:33 +01:00
e9f30cab2c first working version for the new build system 2016-07-30 17:53:18 +01:00
089f0ab582 Debugged HMC for Creutz relation 2016-07-28 16:44:41 +01:00
df6c9f55d1 Use common benchmark output format for dwf_sweep and zmm 2016-07-20 17:38:56 +01:00
b93e18ed50 Modified the Dirac Kernel class to compile with different numbers of colours
Added the general push_back functionality to accommodate all defined representations

Compiles, not tested
2016-07-18 16:36:28 +01:00
9c77bb69a5 Added all elements for Hirep HMC
TODO: Test and debug
2016-07-18 12:05:23 +01:00
27f3ecc833 Merge branch 'feature/bugfix-ck-cj' into develop 2016-07-16 01:59:52 +01:00
f9e90eeb1f Sign error on the force for 4d fields fixed 2016-07-16 01:52:44 +01:00
fad5c675eb sign error on the 4d gparity force 2016-07-16 01:51:56 +01:00
4908b77d46 Fixed conflicts. PLEASE avoid making wholesale cosmetic-only changes; this created
a HUGE amount of difficult-to-resolve and difficult-to-understand conflicts.

Wholesale formatting, reordering of functions, etc. in a central file like Tensor_class
or Grid_vector_types, while others are also editing and without making substantial functionality
changes, creates pain.
2016-07-15 20:59:07 +01:00
f4dd5062d7 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2016-07-15 19:26:06 +01:00
da34d75841 Merge branch 'feature/Ls-vectorised-actions' into develop 2016-07-15 19:09:47 +01:00
980ff18956 Solving the instantiation compile failure 2016-07-15 17:19:44 +01:00
7edf4c6c04 Added HMC utilities for the higher representations
TODO: Inherit types for the pseudofermions, Debugging, testing
2016-07-15 13:39:47 +01:00
1a6c7204ac Disable instantiation; Use cache version instead 2016-07-15 00:34:39 +01:00
49310fbab3 Done with red black change over 2016-07-15 00:08:43 +01:00
6049d5ac47 Update 2016-07-15 00:08:32 +01:00
35d0d35238 Updated file list 2016-07-15 00:02:53 +01:00
c0e878705e Updated file list 2016-07-15 00:02:39 +01:00
5c0c8efb9e Updated file list 2016-07-15 00:02:11 +01:00
dfd714e1ef Multiple implementations for the 5d hopping terms, depending on cache friendly
ops and/or the 5th direction being vectorised
All use 4d redblack.
2016-07-15 00:00:09 +01:00
79a8ca1a62 Rewrite for performance. Impl dependent instantiations give
4d linalg impls of the 5d hopping terms (and inverse)
Cache friendly loop orderings of the above
Dense matrix stored and apply to the above

-- Switch to Ls vectorised, and use dense matrix approach for the MooeeInv
   and rotate/shift of the Mooee M5D routines.
2016-07-14 23:58:15 +01:00
fb45eb2eb2 5d ls vec rename of impl class 2016-07-14 23:57:26 +01:00
a307274c96 Fermion impl rename for ls vectorised 5d approaches 2016-07-14 23:56:13 +01:00
3f2c44a5fe Updating the class to 5d selection based on impl type 2016-07-14 23:55:26 +01:00
48fb1cdc11 Update domain 5d vectorised impl type, move the type over to 4d redblack with
the dense OO inverse
2016-07-14 23:54:35 +01:00
8a79e93cc2 Rename the 5d domain wall fermion vectorised Ls impl class 2016-07-14 23:53:00 +01:00
3493b51879 Modest updates 2016-07-14 23:52:13 +01:00
de3e79d300 red black for Ls vectorised is 4d red black. Update accordingly now I've made this choice 2016-07-14 23:49:42 +01:00
dd62a61c5c Added broadcast and rotation of simd vectors 2016-07-14 23:49:00 +01:00
8f47d0b5ab Rotation needed for hopping term in fifth dim with Ls vectorised fields 2016-07-14 23:45:36 +01:00
42af132dab Fix for chris kellys request to peek poke on checkerboarded fields 2016-07-14 23:44:48 +01:00
9db2c6525d updating benchmarks for red black 4d for Ls vectorised code 2016-07-14 23:44:02 +01:00
adbc7c1188 Adding files for multiple implementations (cache opt) and Ls vectorisation
of the 5D Cayley form chiral fermions for the 5d matrix. With Ls entirely
in the vector direction, s-hopping terms involve rotations.

The serial dependence of the LDU inversion for Mobius and 4d even odd
checkerboarding is removed by simply applying Ls^2 operations (vectorised
many ways) as a dense matrix operation.

This should give similar throughput at a higher flop count (non-compulsory flops),
but enables use of the KNL cache-friendly kernels throughout the code.

Ls is still constrained to be a multiple of Nsimd, which is as much as 8 for AVX512
with single precision.
2016-07-14 22:59:21 +01:00
9dc345e8e8 Debugged smearing and adding HMC functions for hirep 2016-07-13 17:51:18 +01:00
8b9301a74c Merge branch 'feature/bugfixes' into develop 2016-07-13 12:31:34 -04:00
6f47fbb1e2 Disabled parallel for loops in ExtractSlice and InsertSlice due to race conditions. Likely will need to do so for localConvert too. 2016-07-13 10:49:18 -04:00
a9ae30f868 Added representations definitions for the HMC 2016-07-12 13:36:10 +01:00
a3c0fb79b6 Fix to iVector and iMatrix pokeIndex and checkerboard local site indexing. 2016-07-11 17:15:22 -04:00
62601bb649 Bug fix 2016-07-08 20:46:29 +01:00
ef97e32152 Adding persistent communicators 2016-07-08 17:16:08 +01:00
daea5297ee Wrote the projector in the adjoint representation algebra 2016-07-08 16:14:16 +01:00
5028969d4b Added generators for the adjoint representation 2016-07-08 15:40:11 +01:00
c667d9fdcc Trying to make compile clean on travis; seems to have a make -j 4 problem with fftw 2016-07-07 23:26:39 +01:00
7dbb94bab2 Update 2016-07-07 22:51:37 +01:00
236dcc820b typo fix 2016-07-07 22:46:11 +01:00
a42a441a6a Rename the reconfigure script to ./autogen.sh 2016-07-07 22:35:45 +01:00
a0676beeb1 Open up dependency on Eigen and FFTW 2016-07-07 22:31:07 +01:00
c5106d0c03 Bugfix 2016-07-07 16:06:30 -04:00
fbf96b1bbb Merge branch 'develop' into feature/hirep 2016-07-07 14:20:10 +01:00
3c49ddfaa4 Merge branch 'temporary-smearing' into develop 2016-07-07 14:04:59 +01:00
ffb8b3116c Tested smeared RHMC Wilson1p1, accepting 2016-07-07 11:49:36 +01:00
290493e162 Merge branch 'feature/multi_prec' into develop 2016-07-06 19:29:57 -04:00
dd8cfff111 Another fix for pedantic compilers 2016-07-06 18:22:15 -04:00
184642adb0 Fix for pedantic compilers 2016-07-06 18:15:15 -04:00
4774a3bcd2 Generalized HotConfiguration and functions it calls to accept gauge fields with precision other than the default. 2016-07-06 18:01:08 -04:00
25fafa9a89 Comment 2016-07-06 16:19:41 -04:00
713520d3d2 Added tester for mixed CG 2016-07-06 16:18:19 -04:00
85ed8175cb Implemented mixed precision CG. Fixed filelist to exclude lib/Old directory and include Config.h. 2016-07-06 15:57:04 -04:00
df5c788ef2 Merge branch 'develop' into feature/multi_prec 2016-07-06 14:52:28 -04:00
15f22425c8 Added option to prevent CG from exiting when it fails to converge 2016-07-06 14:50:01 -04:00
e87182cf98 Debugged the copy constructor of the Lattice class 2016-07-06 15:31:00 +01:00
e3d5319470 Debugged the real() and imag() functions and added tests to Test_Simd 2016-07-06 14:16:03 +01:00
ffedeb1c58 Minor modifications 2016-07-06 11:41:27 +01:00
3e3b367aa9 Small changes in the Log files 2016-07-05 15:05:28 +01:00
3e80947c2b Cleaned up HMC output. Tested smeared HMCs for single precision (OK) 2016-07-05 12:03:54 +01:00
fdfbf11c6d Merge branch 'develop' into temporary-smearing 2016-07-04 18:45:10 +01:00
9cb90f714e Merge remote-tracking branch 'origin/develop' into temporary-smearing 2016-07-04 17:28:40 +01:00
6ce174cd60 Testing smearing for RHMC routines 2016-07-04 16:36:49 +01:00
17ca5240f7 Tested smeared EOWilsonRatio, accepts 2016-07-04 16:25:15 +01:00
2daffdf95d Tested smeared WilsonRatio action, accepts 2016-07-04 16:17:28 +01:00
149f826601 Tested smearing for Nf2 WilsonFermionAction, non EO: accepts 2016-07-04 16:09:19 +01:00
cd8ee27080 Simple change in iGamma for smearing 2016-07-04 16:02:57 +01:00
0fa66e8f3c Debugged smearing for EOWilson, accepts 2016-07-04 15:35:37 +01:00
8dd099267d Corrected a bug in the Expression Templates (acos and asin were wrong) 2016-07-03 12:28:25 +01:00
1a6d65c6a4 Converted set_uw and set_fj to all complex functions 2016-07-03 10:27:43 +01:00
fc4a043663 Colors and banner clean up 2016-07-02 16:15:38 +01:00
61ba50665e Merge branch 'hotfix/v0.5.1' into develop 2016-07-01 16:34:30 +01:00
bfe14000a9 Double compile fix 2016-07-01 16:33:51 +01:00
092fa0d8da Debugged set_fj,
to be fixed: BUG in imag()
2016-07-01 16:06:20 +01:00
1ceff48133 Merge branch 'release/v0.5.0' into develop 2016-06-30 15:15:59 -07:00
680645f849 Merge branch 'release/v0.5.0' 2016-06-30 15:15:03 -07:00
565e9329ba Changed the colouring classes 2016-06-30 16:51:03 +01:00
5e02392f9c Fixed compilation error for benchmark_dwf
Some parts were assuming floating point precision
2016-06-20 12:30:51 +01:00
neo 339be37dba Debugging smeared HMC 2016-04-13 17:00:14 +09:00
neo a87b744621 HMC runs but does not accept with smearing on 2016-04-07 16:45:11 +09:00
97d0d56bcb Debugging Smearing routines (set_fj) 2016-04-06 17:58:43 +09:00
7c7ea35ffb Putting the Traceless Antihermitian part outside the deriv in pseudofermion actions 2016-04-05 16:28:09 +09:00
4b1cf580e0 Debugging the Smearing routines 2016-04-05 16:19:30 +09:00
2d8bb356e3 Smearing routines compile (still untested) 2016-02-25 02:43:59 +09:00
a7251f28c7 Stout smearing compiles (untested) 2016-02-24 03:16:50 +09:00
neo c1b1b89d17 More on smearing routines, writing APEsmear (dev) 2016-02-19 17:15:27 +09:00
neo 771235017d Adding smearing routines (development) 2016-02-19 15:30:41 +09:00
257 changed files with 16357 additions and 7283 deletions

.gitignore (vendored), 30 lines changed

@ -5,7 +5,6 @@
*.o
*.obj
# Editor files #
################
*~
@ -48,6 +47,7 @@ Config.h.in
config.log
config.status
.deps
*.inc
# http://www.gnu.org/software/autoconf #
########################################
@ -62,19 +62,8 @@ stamp-h1
config.sub
config.guess
INSTALL
# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
.dirstamp
ltmain.sh
# Logs and databases #
######################
@ -100,3 +89,16 @@ build*/*
#####################
*.xcodeproj/*
build.sh
# Eigen source #
################
lib/Eigen/*
# FFTW source #
################
lib/fftw/*
# libtool macros #
##################
m4/lt*
m4/libtool.m4


@ -9,10 +9,6 @@ matrix:
- os: osx
osx_image: xcode7.2
compiler: clang
- os: osx
osx_image: xcode7.2
compiler: gcc
env: VERSION=-5
- compiler: gcc
addons:
apt:
@ -23,6 +19,8 @@ matrix:
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-4.9
- compiler: gcc
@ -35,6 +33,8 @@ matrix:
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-5
- compiler: clang
@ -47,6 +47,8 @@ matrix:
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- compiler: clang
@ -59,6 +61,8 @@ matrix:
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
@ -69,6 +73,7 @@ before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
install:
@ -82,9 +87,20 @@ install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
script:
- ./scripts/reconfigure_script
- ./bootstrap.sh
- mkdir build
- cd build
- ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1
- echo make clean
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1
- echo make clean
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
- make -j4
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi


@ -1,5 +1,5 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/
SUBDIRS = lib tests benchmarks
SUBDIRS = lib benchmarks tests
filelist: $(SUBDIRS)
AM_CXXFLAGS += -I$(top_builddir)/include
ACLOCAL_AMFLAGS = -I m4

README.md, 110 lines changed

@ -1,8 +1,28 @@
# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid)
Data parallel C++ mathematical object library
# Grid
<table>
<tr>
<td>Last stable release</td>
<td><a href="https://travis-ci.org/paboyle/Grid">
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
</td>
</tr>
<tr>
<td>Development branch</td>
<td><a href="https://travis-ci.org/paboyle/Grid">
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
</td>
</tr>
</table>
Last update 2015/7/30
**Data parallel C++ mathematical object library.**
Please send all pull requests to the `develop` branch.
License: GPL v2.
Last update 2016/08/03.
### Description
This library provides data parallel C++ container classes with internal memory layout
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
@ -22,37 +42,75 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
for most programmers.
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON on the way).
Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON and BG/Q QPX on the way).
These are presented as
vRealF, vRealD, vComplexF, vComplexD
internal vector data types. These may be useful in themselves for other programmers.
The corresponding scalar types are named
RealF, RealD, ComplexF, ComplexD
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
MPI, OpenMP, and SIMD parallelism are present in the library.
Please see https://arxiv.org/abs/1512.03487 for more detail.
You can give `configure' initial values for configuration parameters
by setting variables in the command line or in the environment. Here
are examples:
### Installation
First, start by cloning the repository:
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
``` bash
git clone https://github.com/paboyle/Grid.git
```
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX
Then enter the cloned directory and set up the build system:
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
``` bash
cd Grid
./bootstrap.sh
```
./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
Note: Before running configure it could be necessary to execute the script
script/filelist
Now you can execute the `configure` script to generate makefiles (here from a build directory):
``` bash
mkdir build; cd build
../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
```
For developers:
Use reconfigure_script in the scripts/ directory to create the autotools environment
where `--enable-precision=` sets the default precision (`single` or `double`),
`--enable-simd=` sets the SIMD type (see possible values below), and
`--enable-comms=` sets the protocol used for communications (`none`, `mpi`, `mpi-auto` or
`shmem`); `<path>` should be replaced by the prefix path where you want to
install Grid. The `mpi-auto` communication option makes `configure` determine
automatically how to link to MPI. Other options are available; use `configure --help`
to display them. As with any other program using GNU autotools, the
`CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
customise the build.
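For example, an AVX2 build without MPI using Clang might be configured as follows (an illustrative invocation only; adapt the flags to your compiler and target):
``` bash
../configure --enable-precision=single --enable-simd=AVX2 --enable-comms=none \
             CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2"
```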
Finally, you can build and install Grid:
``` bash
make; make install
```
To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
``` bash
make -C tests/<subdir> tests
```
### Possible SIMD types
The following options can be used with the `--enable-simd=` option to target different SIMD instruction sets:
| String | Description |
| ----------- | -------------------------------------- |
| `GEN` | generic portable vector code |
| `SSE4` | SSE 4.2 (128 bit) |
| `AVX` | AVX (256 bit) |
| `AVXFMA4` | AVX (256 bit) + FMA |
| `AVX2` | AVX 2 (256 bit) |
| `AVX512` | AVX 512 bit |
| `AVX512MIC` | AVX 512 bit for Intel MIC architecture |
| `IMCI`      | Intel IMCI instructions (512 bit)      |
Alternatively, some CPU codenames can be directly used:
| String | Description |
| ----------- | -------------------------------------- |
| `KNC` | [Intel Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
| `KNL` | [Intel Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
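For instance, a Knights Landing build with automatically detected MPI could be configured like this (an illustrative sketch; replace `<path>` with your install prefix):
``` bash
../configure --enable-precision=double --enable-simd=KNL --enable-comms=mpi-auto --prefix=<path>
```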


@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
@ -194,7 +194,128 @@ int main (int argc, char ** argv)
}
}
#if 0
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=32;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
std::vector<CartesianCommunicator::CommsRequest_t> empty;
std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
for(int mu=0;mu<4;mu++){
ncomm=0;
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc;
int xmit_to_rank;
int recv_from_rank;
comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromInit(requests_fwd[mu],
(void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromInit(requests_bwd[mu],
(void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
{
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
Grid.SendToRecvFromBegin(requests_fwd[mu]);
Grid.SendToRecvFromComplete(requests_fwd[mu]);
Grid.SendToRecvFromBegin(requests_bwd[mu]);
Grid.SendToRecvFromComplete(requests_bwd[mu]);
}
}
Grid.Barrier();
}
double stop=usecond();
double dbytes = bytes;
double xbytes = Nloop*dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
double time = stop-start;
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
}
{
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
Grid.SendToRecvFromBegin(requests_fwd[mu]);
Grid.SendToRecvFromBegin(requests_bwd[mu]);
Grid.SendToRecvFromComplete(requests_fwd[mu]);
Grid.SendToRecvFromComplete(requests_bwd[mu]);
}
}
Grid.Barrier();
}
double stop=usecond();
double dbytes = bytes;
double xbytes = Nloop*dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
double time = stop-start;
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
}
}
}
#endif
Grid_finalize();
}


@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
@ -46,9 +45,9 @@ struct scal {
};
bool overlapComms = false;
typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
int main (int argc, char ** argv)
@ -71,8 +70,8 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
@ -87,8 +86,6 @@ int main (int argc, char ** argv)
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
@ -127,21 +124,20 @@ int main (int argc, char ** argv)
RealD mass=0.1;
RealD M5 =1.8;
typename DomainWallFermionR::ImplParams params;
params.overlapCommsCompute = overlapComms;
RealD NP = UGrid->_Nprocessors;
for(int doasm=1;doasm<2;doasm++){
QCD::WilsonKernelsStatic::AsmOpt=doasm;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall =10;
std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
int ncall =100;
if (1) {
Dw.ZeroCounters();
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
@ -160,16 +156,17 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
// Dw.Report();
Dw.Report();
}
if (1)
{
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
LatticeFermionF ssrc(sFGrid);
LatticeFermionF sref(sFGrid);
LatticeFermionF sresult(sFGrid);
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
LatticeFermion ssrc(sFGrid);
LatticeFermion sref(sFGrid);
LatticeFermion sresult(sFGrid);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
@ -177,12 +174,13 @@ int main (int argc, char ** argv)
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF tmp;
SpinColourVector tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
double t0=usecond();
sDw.ZeroCounters();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.Dhop(ssrc,sresult,0);
@ -192,22 +190,23 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
// sDw.Report();
sDw.Report();
if(0){
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
}
}
std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
RealF sum=0;
@ -217,21 +216,23 @@ int main (int argc, char ** argv)
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF normal, simd;
SpinColourVector normal, simd;
peekSite(normal,result,site);
peekSite(simd,sresult,site);
sum=sum+norm2(normal-simd);
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
if (norm2(normal-simd) > 1.0e-6 ) {
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd "<<simd<<std::endl;
}
}}}}}
std::cout<<" difference between normal and simd is "<<sum<<std::endl;
std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
if (1) {
LatticeFermionF sr_eo(sFGrid);
LatticeFermionF serr(sFGrid);
LatticeFermion sr_eo(sFGrid);
LatticeFermion serr(sFGrid);
LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
@ -249,17 +250,21 @@ int main (int argc, char ** argv)
sr_e = zero;
sr_o = zero;
sDw.ZeroCounters();
sDw.stat.init("DhopEO");
double t0=usecond();
for(int i=0;i<ncall;i++){
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
for (int i = 0; i < ncall; i++) {
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
}
double t1=usecond();
sDw.stat.print();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
sDw.Report();
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
@ -268,9 +273,9 @@ int main (int argc, char ** argv)
pickCheckerboard(Even,ssrc_e,sresult);
pickCheckerboard(Odd ,ssrc_o,sresult);
ssrc_e = ssrc_e - sr_e;
std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<<std::endl;
std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;
ssrc_o = ssrc_o - sr_o;
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<<std::endl;
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl;
}
@ -284,18 +289,19 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
@ -317,6 +323,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
{
Dw.ZeroCounters();
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
@ -328,6 +335,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);


@ -26,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;


@ -1,4 +1,3 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -27,8 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
@ -53,7 +51,7 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;
const int Ls=8;
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
@ -63,6 +61,8 @@ int main (int argc, char ** argv)
QCD::WilsonKernelsStatic::AsmOpt=0;
}
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
@ -127,7 +127,6 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
@ -146,11 +145,10 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
}
#ifdef CHECK
if (1)
{
if (1) {
ref = zero;
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
@ -194,20 +192,19 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
Counter.Report();
}
if ( ! report )
{
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
}
if ( ! report ) {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
}
#ifdef CHECK
err = ref-result;
RealD errd = norm2(err);
if ( errd> 1.0e-4 ) {
std::cout<<GridLogMessage << "oops !!! norm diff "<< norm2(err)<<std::endl;
exit(-1);
}
err = ref-result;
RealD errd = norm2(err);
if ( errd> 1.0e-4 ) {
std::cout<<GridLogMessage << "oops !!! norm diff "<< norm2(err)<<std::endl;
exit(-1);
}
#endif
LatticeFermion src_e (FrbGrid);
@ -233,10 +230,9 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
std::cout<< flops/(t1-t0);
}
}
}
#undef CHECK_SDW
#define CHECK_SDW
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
{
@ -244,7 +240,9 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
@ -278,93 +276,89 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
}
}
RealD mass=0.1;
RealD M5 =1.8;
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
LatticeFermionF ssrc(sFGrid);
LatticeFermionF sref(sFGrid);
LatticeFermionF sresult(sFGrid);
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
LatticeFermion ssrc(sFGrid);
LatticeFermion sref(sFGrid);
LatticeFermion sresult(sFGrid);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVector tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
double t0=usecond();
sDw.Dhop(ssrc,sresult,0);
double t1=usecond();
double t0=usecond();
sDw.Dhop(ssrc,sresult,0);
double t1=usecond();
#ifdef TIMERS_OFF
int ncall =10;
int ncall =10;
#else
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif
PerformanceCounter Counter(8);
Counter.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
sDw.Dhop(ssrc,sresult,0);
}
t1=usecond();
Counter.Stop();
PerformanceCounter Counter(8);
Counter.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
sDw.Dhop(ssrc,sresult,0);
}
t1=usecond();
Counter.Stop();
if ( report ) {
Counter.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<"\t"<< flops/(t1-t0);
}
if ( report ) {
Counter.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<"\t"<< flops/(t1-t0);
}
LatticeFermionF sr_eo(sFGrid);
LatticeFermionF serr(sFGrid);
LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
LatticeFermion sr_e (sFrbGrid);
LatticeFermion sr_o (sFrbGrid);
LatticeFermion sr_eo(sFGrid);
LatticeFermion serr(sFGrid);
LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
LatticeFermion sr_e (sFrbGrid);
LatticeFermion sr_o (sFrbGrid);
pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc);
setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e);
sr_e = zero;
sr_o = zero;
pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc);
setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e);
sr_e = zero;
sr_o = zero;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
PerformanceCounter CounterSdw(8);
CounterSdw.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
PerformanceCounter CounterSdw(8);
CounterSdw.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
__SSC_STOP;
}
t1=usecond();
CounterSdw.Stop();
__SSC_STOP;
}
t1=usecond();
CounterSdw.Stop();
if ( report ) {
CounterSdw.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<"\t"<< flops/(t1-t0);
}
if ( report ) {
CounterSdw.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<"\t"<< flops/(t1-t0);
}
}


@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;


@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;


@ -26,7 +26,7 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;


@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;


@ -0,0 +1,117 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_wilson.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Richard Rollins <rprollins@users.noreply.github.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
bool overlapComms = false;
void bench_wilson (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
double const volume,
int const dag );
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; }
typename WilsonFermionR::ImplParams params;
params.overlapCommsCompute = overlapComms;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<int> seeds({1,2,3,4});
RealD mass = 0.1;
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
int Lmax = 32;
int dmin = 0;
if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
for (int L=8; L<=Lmax; L*=2)
{
std::vector<int> latt_size = std::vector<int>(4,L);
for(int d=4; d>dmin; d--)
{
if ( d<=3 ) { latt_size[d] *= 2; }
std::cout << GridLogMessage;
std::copy( latt_size.begin(), --latt_size.end(), std::ostream_iterator<int>( std::cout, std::string("x").c_str() ) );
std::cout << latt_size.back() << "\t\t";
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
LatticeFermion src(&Grid); random(pRNG,src);
LatticeFermion result(&Grid); result=zero;
double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
bench_wilson(src,result,Dw,volume,DaggerNo);
bench_wilson(src,result,Dw,volume,DaggerYes);
std::cout << std::endl;
}
}
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
Grid_finalize();
}
void bench_wilson (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
double const volume,
int const dag )
{
int ncall = 1000;
double t0 = usecond();
for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
double t1 = usecond();
double flops = 1344 * volume * ncall;
std::cout << flops/(t1-t0) << "\t\t";
}


@ -25,8 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
#include <Grid/Grid.h>
using namespace Grid;
@ -41,14 +40,20 @@ int main(int argc,char **argv)
std::ofstream os("zmm.dat");
os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
for(int L=4;L<=32;L+=4){
for(int m=1;m<=2;m++){
for(int Ls=8;Ls<=16;Ls+=8){
std::vector<int> grid({L,L,m*L,m*L});
std::cout << GridLogMessage <<"\t";
for(int i=0;i<4;i++) {
std::cout << grid[i]<<"x";
}
std::cout << Ls<<std::endl;
std::cout << Ls<<"\t\t";
bench(os,grid,Ls);
}
}
@ -105,7 +110,6 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
RealD M5 =1.8;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=50;
double t0=usecond();
for(int i=0;i<ncall;i++){
@ -117,7 +121,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
double flops=1344*volume/2;
mfc = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s = "<< mfc<<std::endl;
std::cout<<mfc<<"\t\t";
QCD::WilsonKernelsStatic::AsmOpt=1;
t0=usecond();
@ -126,7 +130,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
}
t1=usecond();
mfa = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s = "<< mfa<<std::endl;
std::cout<<mfa<<"\t\t";
/*
int dag=DaggerNo;
t0=usecond();
@ -164,8 +168,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
//resulta = (-0.5) * resulta;
diff = resulto-resulta;
std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
std::cout<<std::endl;
std::cout<<norm2(diff)<<std::endl;
return 0;
}


@ -1,39 +0,0 @@
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
Benchmark_comms_SOURCES=Benchmark_comms.cc
Benchmark_comms_LDADD=-lGrid
Benchmark_dwf_SOURCES=Benchmark_dwf.cc
Benchmark_dwf_LDADD=-lGrid
Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
Benchmark_dwf_ntpf_LDADD=-lGrid
Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
Benchmark_dwf_sweep_LDADD=-lGrid
Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
Benchmark_memory_asynch_LDADD=-lGrid
Benchmark_memory_bandwidth_SOURCES=Benchmark_memory_bandwidth.cc
Benchmark_memory_bandwidth_LDADD=-lGrid
Benchmark_su3_SOURCES=Benchmark_su3.cc
Benchmark_su3_LDADD=-lGrid
Benchmark_wilson_SOURCES=Benchmark_wilson.cc
Benchmark_wilson_LDADD=-lGrid
Benchmark_zmm_SOURCES=Benchmark_zmm.cc
Benchmark_zmm_LDADD=-lGrid


@ -1,8 +1 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/lib
AM_LDFLAGS = -L$(top_builddir)/lib
#
# Test code
#
include Make.inc

bootstrap.sh (new executable file), 19 lines added

@ -0,0 +1,19 @@
#!/usr/bin/env bash
EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
echo "-- deploying Eigen source..."
wget ${EIGEN_URL} --no-check-certificate
./scripts/update_eigen.sh `basename ${EIGEN_URL}`
rm `basename ${EIGEN_URL}`
echo "-- copying fftw prototypes..."
wget ${FFTW_URL}
./scripts/update_fftw.sh `basename ${FFTW_URL}`
rm `basename ${FFTW_URL}`
echo '-- generating Make.inc files...'
./scripts/filelist
echo '-- generating configure script...'
autoreconf -fvi


@ -1,315 +1,362 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
#
# Project Grid package
#
# Time-stamp: <2015-07-10 17:46:21 neo>
AC_PREREQ([2.63])
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
AC_CANONICAL_SYSTEM
AC_INIT([Grid], [0.5.1-dev], [https://github.com/paboyle/Grid], [Grid])
AC_CANONICAL_BUILD
AC_CANONICAL_HOST
AC_CANONICAL_TARGET
AM_INIT_AUTOMAKE(subdir-objects)
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_SRCDIR([lib/Grid.h])
AC_CONFIG_HEADERS([lib/Config.h])
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_MSG_NOTICE([
:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
Configuring $PACKAGE v$VERSION for $host
:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
])
# Checks for programs.
############### Checks for programs
AC_LANG(C++)
CXXFLAGS="-O3 $CXXFLAGS"
AC_PROG_CXX
AC_OPENMP
AC_PROG_RANLIB
#AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
AX_EXT
# Checks for libraries.
#AX_GCC_VAR_ATTRIBUTE(aligned)
############ openmp ###############
AC_OPENMP
# Checks for header files.
ac_openmp=no
if test "${OPENMP_CXXFLAGS}X" != "X"; then
ac_openmp=yes
AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
fi
############### Checks for header files
AC_CHECK_HEADERS(stdint.h)
AC_CHECK_HEADERS(mm_malloc.h)
AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(gmp.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
# Checks for typedefs, structures, and compiler characteristics.
############### Checks for typedefs, structures, and compiler characteristics
AC_TYPE_SIZE_T
AC_TYPE_UINT32_T
AC_TYPE_UINT64_T
# Checks for library functions.
echo
echo Checking libraries
echo :::::::::::::::::::::::::::::::::::::::::::
############### GMP and MPFR #################
AC_ARG_WITH([gmp],
[AS_HELP_STRING([--with-gmp=prefix],
[try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])],
[AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
################## lapack ####################
AC_ARG_ENABLE([lapack],
[AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
case ${ac_LAPACK} in
no)
;;
yes)
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
*)
AM_CXXFLAGS="-I$ac_LAPACK/include $AM_CXXFLAGS"
AM_LDFLAGS="-L$ac_LAPACK/lib $AM_LDFLAGS"
AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
esac
################## first-touch ####################
AC_ARG_ENABLE([numa],
[AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
[ac_NUMA=${enable_numa}],[ac_NUMA=no])
case ${ac_NUMA} in
no)
;;
yes)
AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
*)
AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
esac
################## FFTW3 ####################
AC_ARG_WITH([fftw],
[AS_HELP_STRING([--with-fftw=prefix],
[try this for a non-standard install prefix of the FFTW3 library])],
[AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
################ Get compiler informations
AC_LANG([C++])
AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
AX_COMPILER_VENDOR
AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
[vendor of C++ compiler that will compile the code])
AX_GXX_VERSION
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
[version of g++ that will compile the code])
############### Checks for library functions
CXXFLAGS_CPY=$CXXFLAGS
LDFLAGS_CPY=$LDFLAGS
CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
LDFLAGS="$AM_LDFLAGS $LDFLAGS"
AC_CHECK_FUNCS([gettimeofday])
AC_CHECK_LIB([gmp],[__gmpf_init],
[AC_CHECK_LIB([mpfr],[mpfr_init],
[AC_DEFINE([HAVE_LIBMPFR], [1], [Define to 1 if you have the `MPFR' library (-lmpfr).])]
[have_mpfr=true]
[LIBS="$LIBS -lmpfr"],
[AC_MSG_ERROR([MPFR library not found])])]
[AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
[have_gmp=true]
[LIBS="$LIBS -lgmp"],
[AC_MSG_WARN([**** GMP library not found, Grid can still compile but RHMC will not work ****])])
#AC_CHECK_LIB([gmp],[__gmpf_init],,
# [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.gmplib.org)])
if test "${ac_LAPACK}x" != "nox"; then
AC_CHECK_LIB([lapack],[LAPACKE_sbdsdc],[],
[AC_MSG_ERROR("LAPACK enabled but library not found")])
fi
AC_CHECK_LIB([fftw3],[fftw_execute],
[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])]
[have_fftw=true]
[LIBS="$LIBS -lfftw3 -lfftw3f"],
[AC_MSG_WARN([**** FFTW library not found, Grid can still compile but FFT-based routines will not work ****])])
CXXFLAGS=$CXXFLAGS_CPY
LDFLAGS=$LDFLAGS_CPY
#AC_CHECK_LIB([mpfr],[mpfr_init],,
# [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.mpfr.org/)])
#
# SIMD instructions selection
#
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
############### SIMD instruction selection
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVXFMA|AVX2|AVX512|AVX512MIC|IMCI|KNL|KNC],\
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
[ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
[ac_SIMD=${enable_simd}],[ac_SIMD=GEN])
supported=no
ac_ZMM=no;
case ${ax_cv_cxx_compiler_vendor} in
clang|gnu)
case ${ac_SIMD} in
SSE4)
AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
SIMD_FLAGS='-msse4.2';;
AVX)
AC_DEFINE([AVX1],[1],[AVX intrinsics])
SIMD_FLAGS='-mavx';;
AVXFMA4)
AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
SIMD_FLAGS='-mavx -mfma4';;
AVXFMA)
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
SIMD_FLAGS='-mavx -mfma';;
AVX2)
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
SIMD_FLAGS='-mavx2 -mfma';;
AVX512|AVX512MIC|KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
IMCI|KNC)
AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
SIMD_FLAGS='';;
GEN)
AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
SIMD_FLAGS='';;
QPX|BGQ)
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
SIMD_FLAGS='';;
*)
AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
esac;;
intel)
case ${ac_SIMD} in
SSE4)
AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
SIMD_FLAGS='-msse4.2 -xsse4.2';;
AVX)
AC_DEFINE([AVX1],[1],[AVX intrinsics])
SIMD_FLAGS='-mavx -xavx';;
AVXFMA4)
AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
SIMD_FLAGS='-mavx -mfma';;
AVXFMA)
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
SIMD_FLAGS='-mavx -mfma';;
AVX2)
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
AVX512)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
SIMD_FLAGS='-xcore-avx512';;
AVX512MIC|KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
SIMD_FLAGS='-xmic-avx512';;
IMCI|KNC)
AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
SIMD_FLAGS='';;
GEN)
AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
SIMD_FLAGS='';;
*)
AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
esac;;
*)
AC_MSG_WARN([Compiler unknown, using generic vector code])
AC_DEFINE([GENERIC_VEC],[1],[generic vector code]);;
esac
AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"
case ${ac_SIMD} in
SSE4)
echo Configuring for SSE4
AC_DEFINE([SSE4],[1],[SSE4 Intrinsics] )
if test x"$ax_cv_support_ssse3_ext" = x"yes"; then dnl minimal support for SSE4
supported=yes
else
AC_MSG_WARN([Your processor does not support SSE4 instructions])
fi
;;
AVX)
echo Configuring for AVX
AC_DEFINE([AVX1],[1],[AVX Intrinsics] )
if test x"$ax_cv_support_avx_ext" = x"yes"; then dnl minimal support for AVX
supported=yes
else
AC_MSG_WARN([Your processor does not support AVX instructions])
fi
;;
AVXFMA4)
echo Configuring for AVX
AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
if test x"$ax_cv_support_avx_ext" = x"yes"; then dnl minimal support for AVX
supported=yes
else
AC_MSG_WARN([Your processor does not support AVX instructions])
fi
;;
AVX2)
echo Configuring for AVX2
AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
if test x"$ax_cv_support_avx2_ext" = x"yes"; then dnl minimal support for AVX2
supported=yes
else
AC_MSG_WARN([Your processor does not support AVX2 instructions])
fi
;;
AVX512)
echo Configuring for AVX512
AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
supported="cross compilation"
ac_ZMM=yes;
;;
IMCI)
echo Configuring for IMCI
AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
supported="cross compilation"
ac_ZMM=no;
;;
NEONv8)
echo Configuring for experimental ARMv8a support
AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
supported="cross compilation"
;;
DEBUG)
echo Configuring without SIMD support - only for compiler DEBUGGING!
AC_DEFINE([EMPTY_SIMD],[1],[EMPTY_SIMD only for DEBUGGING] )
;;
*)
AC_MSG_ERROR([${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options]);
;;
AVX512|AVX512MIC|KNL)
AC_DEFINE([TEST_ZMM],[1],[compile ZMM test]);;
*)
;;
esac
case ${ac_ZMM} in
yes)
echo Enabling ZMM source code
;;
no)
echo Disabling ZMM source code
;;
esac
AM_CONDITIONAL(BUILD_ZMM,[ test "X${ac_ZMM}X" == "XyesX" ])
############### precision selection
AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
case ${ac_PRECISION} in
single)
echo default precision is single
AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
;;
double)
echo default precision is double
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
;;
esac
#
# Comms selection
#
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
############### communication type selection
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|shmem],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
case ${ac_COMMS} in
none)
echo Configuring for NO communications
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
;;
mpi-auto)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
LX_FIND_MPI
if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
LIBS="`echo $MPI_CXXLDFLAGS | sed -E 's/-L@<:@^ @:>@+//g'` $LIBS"
;;
mpi)
echo Configuring for MPI communications
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
;;
shmem)
echo Configuring for SHMEM communications
AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
;;
*)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
;;
esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
#
# RNG selection
#
############### RNG selection
AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\
[Select Random Number Generator to be used])],\
[ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
case ${ac_RNG} in
ranlux48)
AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
;;
mt19937)
AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
;;
*)
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
;;
esac
#
# SDE timing mode
#
AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
############### timer option
AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers],\
[Enable system dependent high res timers])],\
[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
case ${ac_TIMERS} in
yes)
AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
;;
no)
AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
;;
*)
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
;;
esac
#
# Chroma regression tests
#
############### Chroma regression test
AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
case ${ac_CHROMA} in
yes)
echo Enabling tests regressing to Chroma
;;
no)
echo Disabling tests regressing to Chroma
yes|no)
;;
*)
AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
;;
esac
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
#
# Lapack
#
AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
############### Doxygen
AC_PROG_DOXYGEN
case ${ac_LAPACK} in
yes)
echo Enabling lapack
;;
no)
echo Disabling lapack
;;
*)
echo Enabling lapack at ${ac_LAPACK}
;;
esac
if test -n "$DOXYGEN"
then
AC_CONFIG_FILES([docs/doxy.cfg])
fi
AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
###################################################################
# Checks for doxygen support
# if present enables the "make doxyfile" command
#echo
#echo Checking doxygen support
#echo :::::::::::::::::::::::::::::::::::::::::::
#AC_PROG_DOXYGEN
#if test -n "$DOXYGEN"
#then
#AC_CONFIG_FILES([docs/doxy.cfg])
#fi
echo
echo Creating configuration files
echo :::::::::::::::::::::::::::::::::::::::::::
############### Output
cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_CXXFLAGS])
AC_SUBST([AM_LDFLAGS])
AC_CONFIG_FILES(Makefile)
AC_CONFIG_FILES(lib/Makefile)
AC_CONFIG_FILES(tests/Makefile)
AC_CONFIG_FILES(tests/IO/Makefile)
AC_CONFIG_FILES(tests/core/Makefile)
AC_CONFIG_FILES(tests/debug/Makefile)
AC_CONFIG_FILES(tests/forces/Makefile)
AC_CONFIG_FILES(tests/hmc/Makefile)
AC_CONFIG_FILES(tests/solver/Makefile)
AC_CONFIG_FILES(tests/qdpxx/Makefile)
AC_CONFIG_FILES(benchmarks/Makefile)
AC_OUTPUT
echo "
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Summary of configuration for $PACKAGE v$VERSION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The following features are enabled:
----- PLATFORM ----------------------------------------
- architecture (build) : $build_cpu
- os (build) : $build_os
- architecture (target) : $target_cpu
- os (target) : $target_os
- compiler vendor : ${ax_cv_cxx_compiler_vendor}
- compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS -----------------------------------
- SIMD : ${ac_SIMD}
- Threading : ${ac_openmp}
- Communications type : ${ac_COMMS}
- Default precision : ${ac_PRECISION}
- RNG choice : ${ac_RNG}
- GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
- LAPACK : ${ac_LAPACK}
- FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
- build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
- graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
- Supported SIMD flags : $SIMD_FLAGS
----------------------------------------------------------
- enabled simd support : ${ac_SIMD} (config macro says supported: $supported )
- communications type : ${ac_COMMS}
- default precision : ${ac_PRECISION}
- RNG choice : ${ac_RNG}
- LAPACK : ${ac_LAPACK}
----- BUILD FLAGS -------------------------------------
- CXXFLAGS:
`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
- LDFLAGS:
`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
- LIBS:
`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
-------------------------------------------------------
"

include/Grid Symbolic link

@ -0,0 +1 @@
../lib


@ -29,27 +29,28 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H
#include <algorithms/SparseMatrix.h>
#include <algorithms/LinearOperator.h>
#include <algorithms/Preconditioner.h>
#include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h>
#include <algorithms/approx/Zolotarev.h>
#include <algorithms/approx/Chebyshev.h>
#include <algorithms/approx/Remez.h>
#include <algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <algorithms/iterative/ConjugateGradient.h>
#include <algorithms/iterative/ConjugateResidual.h>
#include <algorithms/iterative/NormalEquations.h>
#include <algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
// Lanczos support
#include <algorithms/iterative/MatrixUtils.h>
#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/MatrixUtils.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/CoarsenedMatrix.h>
// Eigen/lanczos
// EigCg


@ -113,9 +113,8 @@ public:
#endif
_Tp tmp;
#undef FIRST_TOUCH_OPTIMISE
#ifdef FIRST_TOUCH_OPTIMISE
#pragma omp parallel for
#ifdef GRID_NUMA
#pragma omp parallel for schedule(static)
for(int i=0;i<__n;i++){
ptr[i]=tmp;
}


@ -28,8 +28,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H
#include <cartesian/Cartesian_base.h>
#include <cartesian/Cartesian_full.h>
#include <cartesian/Cartesian_red_black.h>
#include <Grid/cartesian/Cartesian_base.h>
#include <Grid/cartesian/Cartesian_full.h>
#include <Grid/cartesian/Cartesian_red_black.h>
#endif


@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
#include <communicator/Communicator_base.h>
#include <Grid/communicator/Communicator_base.h>
#endif


@ -28,17 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_
#include <cshift/Cshift_common.h>
#include <Grid/cshift/Cshift_common.h>
#ifdef GRID_COMMS_NONE
#include <cshift/Cshift_none.h>
#include <Grid/cshift/Cshift_none.h>
#endif
#ifdef GRID_COMMS_MPI
#include <cshift/Cshift_mpi.h>
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif
#endif

lib/FFT.h Normal file

@ -0,0 +1,276 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Cshift.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_FFT_H_
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#include <fftw3.h>
#endif
namespace Grid {
template<class scalar> struct FFTW { };
#ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> {
public:
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p);
}
};
template<> struct FFTW<ComplexF> {
public:
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p);
}
};
#endif
#ifndef FFTW_FORWARD
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#endif
class FFT {
private:
GridCartesian *vgrid;
GridCartesian *sgrid;
int Nd;
double flops;
double flops_call;
uint64_t usec;
std::vector<int> dimensions;
std::vector<int> processors;
std::vector<int> processor_coor;
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
FFT ( GridCartesian * grid ) :
vgrid(grid),
Nd(grid->_ndimension),
dimensions(grid->_fdimensions),
processors(grid->_processors),
processor_coor(grid->_processor_coor)
{
flops=0;
usec =0;
std::vector<int> layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors);
};
~FFT ( void) {
delete sgrid;
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
std::vector<int> layout(Nd,1);
std::vector<int> pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<vobj> ssource(vgrid); ssource =source;
Lattice<sobj> pgsource(&pencil_g);
Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
#ifndef HAVE_FFTW
assert(0);
#else
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
{
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
int sign = FFTW_FORWARD;
if (inverse) sign = FFTW_BACKWARD;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
GridStopWatch timer;
// Barrel shift and collect global pencil
for(int p=0;p<processors[dim];p++) {
for(int idx=0;idx<sgrid->lSites();idx++) {
std::vector<int> lcoor(Nd);
sgrid->LocalIndexToLocalCoor(idx,lcoor);
sobj s;
peekLocalSite(s,ssource,lcoor);
lcoor[dim]+=p*L;
pokeLocalSite(s,pgsource,lcoor);
}
ssource = Cshift(ssource,dim,L);
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch Timer;
Timer.Start();
PARALLEL_FOR_LOOP
for(int idx=0;idx<NN;idx++) {
std::vector<int> lcoor(Nd);
pencil_g.LocalIndexToLocalCoor(idx,lcoor);
if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
}
Timer.Stop();
usec += Timer.useconds();
flops+= flops_call*NN;
int pc = processor_coor[dim];
for(int idx=0;idx<sgrid->lSites();idx++) {
std::vector<int> lcoor(Nd);
sgrid->LocalIndexToLocalCoor(idx,lcoor);
std::vector<int> gcoor = lcoor;
// extract the result
sobj s;
gcoor[dim] = lcoor[dim]+L*pc;
peekLocalSite(s,pgresult,gcoor);
pokeLocalSite(s,result,lcoor);
}
FFTW<scalar>::fftw_destroy_plan(p);
}
#endif
}
};
}
#endif
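To make the intent of the new FFT class concrete: the transform is taken one dimension at a time, local slices are barrel-shifted into a global pencil grid, FFTW runs the one-dimensional transforms plane by plane, and the result is scattered back to the original layout. A minimal usage sketch follows; the grid and field set-up are illustrative assumptions (a four-dimensional grid is assumed so that dimension 3 is the time direction), not part of the diff.

#include <Grid.h>
using namespace Grid;

void fft_time_direction(GridCartesian &grid, Lattice<vComplexD> &in)
{
  Lattice<vComplexD> out(&grid);
  FFT fft(&grid);
  // The last argument is the "inverse" flag: 0 selects the forward transform,
  // any non-zero value selects the backward transform.
  fft.FFT_dim(out, in, 3, 0);
  std::cout << GridLogMessage << "FFT rate " << fft.MFlops() << " Mflop/s" << std::endl;
}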


@ -59,29 +59,31 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
///////////////////
// Grid headers
///////////////////
#include <serialisation/Serialisation.h>
#include <Config.h>
#include <Timer.h>
#include <PerfCount.h>
#include <Log.h>
#include <AlignedAllocator.h>
#include <Simd.h>
#include <Threads.h>
#include <Lexicographic.h>
#include <Communicator.h>
#include <Cartesian.h>
#include <Tensors.h>
#include <Lattice.h>
#include <Cshift.h>
#include <Stencil.h>
#include <Algorithms.h>
#include <parallelIO/BinaryIO.h>
#include <qcd/QCD.h>
#include <parallelIO/NerscIO.h>
#include <Init.h>
#include <Grid/serialisation/Serialisation.h>
#include "Config.h"
#include <Grid/Timer.h>
#include <Grid/PerfCount.h>
#include <Grid/Log.h>
#include <Grid/AlignedAllocator.h>
#include <Grid/Simd.h>
#include <Grid/Threads.h>
#include <Grid/Lexicographic.h>
#include <Grid/Init.h>
#include <Grid/Communicator.h>
#include <Grid/Cartesian.h>
#include <Grid/Tensors.h>
#include <Grid/Lattice.h>
#include <Grid/Cshift.h>
#include <Grid/Stencil.h>
#include <Grid/Algorithms.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/qcd/QCD.h>
#include <Grid/parallelIO/NerscIO.h>
#include <qcd/hmc/NerscCheckpointer.h>
#include <qcd/hmc/HmcRunner.h>
#include <Grid/FFT.h>
#include <Grid/qcd/hmc/NerscCheckpointer.h>
#include <Grid/qcd/hmc/HmcRunner.h>


@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
assert(ompthreads.size()==1);
GridThread::SetThreads(ompthreads[0]);
}
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
std::vector<int> cores(0);
arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
@ -193,7 +194,7 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;
std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
exit(EXIT_SUCCESS);
}
@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv)
GridLogConfigure(logstreams);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init();
}
@ -234,26 +234,34 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
}
std::string COL_RED = GridLogColours.colour["RED"];
std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
std::string COL_BLACK = GridLogColours.colour["BLACK"];
std::string COL_GREEN = GridLogColours.colour["GREEN"];
std::string COL_BLUE = GridLogColours.colour["BLUE"];
std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
std::cout <<std::endl;
std::cout <<Logger::RED << "__|__|__|__|__"<< "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<Logger::RED << "__|__|__|__|__"<< "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<Logger::RED << "__|__| | | "<< "| | | "<<Logger::PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
std::cout <<Logger::RED << "__|__ "<< " "<<Logger::PURPLE<<" "<< " _|__"<<std::endl;
std::cout <<Logger::RED << "__|_ "<<Logger::GREEN<<" GGGG "<<Logger::RED<<" RRRR "<<Logger::BLUE <<" III "<<Logger::PURPLE<<"DDDD "<<Logger::PURPLE<<" _|__"<<std::endl;
std::cout <<Logger::RED << "__|_ "<<Logger::GREEN<<"G "<<Logger::RED<<" R R "<<Logger::BLUE <<" I "<<Logger::PURPLE<<"D D "<<Logger::PURPLE<<" _|__"<<std::endl;
std::cout <<Logger::RED << "__|_ "<<Logger::GREEN<<"G "<<Logger::RED<<" R R "<<Logger::BLUE <<" I "<<Logger::PURPLE<<"D D"<<Logger::PURPLE<<" _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|_ "<<Logger::GREEN<<"G GG "<<Logger::RED<<" RRRR "<<Logger::BLUE <<" I "<<Logger::PURPLE<<"D D"<<Logger::GREEN <<" _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|_ "<<Logger::GREEN<<"G G "<<Logger::RED<<" R R "<<Logger::BLUE <<" I "<<Logger::PURPLE<<"D D "<<Logger::GREEN <<" _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|_ "<<Logger::GREEN<<" GGGG "<<Logger::RED<<" R R "<<Logger::BLUE <<" III "<<Logger::PURPLE<<"DDDD "<<Logger::GREEN <<" _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|__ "<< " "<<Logger::GREEN <<" "<< " _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|__|__|__|__"<< "|__|__|_"<<Logger::GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<Logger::BLUE << "__|__|__|__|__"<< "|__|__|_"<<Logger::GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<Logger::BLUE << " | | | | "<< "| | | "<<Logger::GREEN <<" | | |"<< " | | | | "<<std::endl;
std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<COL_RED << "__|_ | | | "<< "| | | "<<COL_PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
std::cout <<COL_RED << "__|_ "<< " "<<COL_PURPLE<<" "<< " _|__"<<std::endl;
std::cout <<COL_RED << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" RRRR "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G GG "<<COL_RED<<" RRRR "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" R R "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<COL_BLUE << "__|_ "<< " "<<COL_GREEN <<" "<< " _|__"<<std::endl;
std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<COL_BLUE << " | | | | "<< "| | | "<<COL_GREEN <<" | | |"<< " | | | | "<<std::endl;
std::cout << std::endl;
std::cout << std::endl;
std::cout <<Logger::YELLOW<< std::endl;
std::cout <<COL_YELLOW<< std::endl;
std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
std::cout << "Colours by Tadahito Boyle "<<std::endl;
std::cout << std::endl;
std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
@ -264,7 +272,8 @@ void Grid_init(int *argc,char ***argv)
std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
std::cout << "GNU General Public License for more details."<<std::endl;
std::cout << Logger::BLACK <<std::endl;
std::cout << COL_BACKGROUND <<std::endl;
std::cout << std::endl;
}


@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H
#include <lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_base.h>
#endif


@ -1,126 +1,92 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Log.cc
Source file: ./lib/Log.cc
Copyright (C) 2015
Copyright (C) 2015
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
GridStopWatch Logger::StopWatch;
std::ostream Logger::devnull(0);
std::string Logger::BLACK("\033[30m");
std::string Logger::RED("\033[31m");
std::string Logger::GREEN("\033[32m");
std::string Logger::YELLOW("\033[33m");
std::string Logger::BLUE("\033[34m");
std::string Logger::PURPLE("\033[35m");
std::string Logger::CYAN("\033[36m");
std::string Logger::WHITE("\033[37m");
std::string Logger::NORMAL("\033[0;39m");
std::string EMPTY("");
std::ostream Logger::devnull(0);
#if 0
GridLogger GridLogError (1,"Error",Logger::RED);
GridLogger GridLogWarning (1,"Warning",Logger::YELLOW);
GridLogger GridLogMessage (1,"Message",Logger::BLACK);
GridLogger GridLogDebug (1,"Debug",Logger::PURPLE);
GridLogger GridLogPerformance(1,"Performance",Logger::GREEN);
GridLogger GridLogIterative (1,"Iterative",Logger::BLUE);
GridLogger GridLogIntegrator (1,"Integrator",Logger::BLUE);
#else
GridLogger GridLogError (1,"Error",EMPTY);
GridLogger GridLogWarning (1,"Warning",EMPTY);
GridLogger GridLogMessage (1,"Message",EMPTY);
GridLogger GridLogDebug (1,"Debug",EMPTY);
GridLogger GridLogPerformance(1,"Performance",EMPTY);
GridLogger GridLogIterative (1,"Iterative",EMPTY);
GridLogger GridLogIntegrator (1,"Integrator",EMPTY);
#endif
Colours GridLogColours(0);
GridLogger GridLogError(1, "Error", GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams)
{
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0);
GridLogWarning.Active(0);
GridLogMessage.Active(0);
GridLogMessage.Active(1); // at least the messages should be always on
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogIntegrator.Active(0);
GridLogColours.Active(0);
int blackAndWhite = 1;
if(blackAndWhite){
Logger::BLACK = std::string("");
Logger::RED =Logger::BLACK;
Logger::GREEN =Logger::BLACK;
Logger::YELLOW =Logger::BLACK;
Logger::BLUE =Logger::BLACK;
Logger::PURPLE =Logger::BLACK;
Logger::CYAN =Logger::BLACK;
Logger::WHITE =Logger::BLACK;
Logger::NORMAL =Logger::BLACK;
}
for(int i=0;i<logstreams.size();i++){
if ( logstreams[i]== std::string("Error") ) GridLogError.Active(1);
if ( logstreams[i]== std::string("Warning") ) GridLogWarning.Active(1);
if ( logstreams[i]== std::string("Message") ) GridLogMessage.Active(1);
if ( logstreams[i]== std::string("Iterative") ) GridLogIterative.Active(1);
if ( logstreams[i]== std::string("Debug") ) GridLogDebug.Active(1);
if ( logstreams[i]== std::string("Performance") ) GridLogPerformance.Active(1);
if ( logstreams[i]== std::string("Integrator" ) ) GridLogIntegrator.Active(1);
for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance"))
GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
}
}
////////////////////////////////////////////////////////////
// Verbose limiter on MPI tasks
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void)
{
int me=0;
void Grid_quiesce_nodes(void) {
int me = 0;
#ifdef GRID_COMMS_MPI
MPI_Comm_rank(MPI_COMM_WORLD,&me);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif
#ifdef GRID_COMMS_SHMEM
me = shmem_my_pe();
#endif
if ( me ) {
if (me) {
std::cout.setstate(std::ios::badbit);
}
}
void Grid_unquiesce_nodes(void)
{
void Grid_unquiesce_nodes(void) {
#ifdef GRID_COMMS_MPI
std::cout.clear();
std::cout.clear();
#endif
}
}

lib/Log.h

@ -6,9 +6,9 @@
Copyright (C) 2015
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -27,6 +27,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <map>
#ifndef GRID_LOG_H
#define GRID_LOG_H
@ -34,56 +37,99 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <execinfo.h>
#endif
namespace Grid {
namespace Grid {
// Dress the output; use std::chrono for time stamping via the StopWatch class
int Rank(void); // used for early stage debug before library init
class Colours{
protected:
bool is_active;
public:
std::map<std::string, std::string> colour;
Colours(bool activate=false){
Active(activate);
};
void Active(bool activate){
is_active=activate;
if (is_active){
colour["BLACK"] ="\033[30m";
colour["RED"] ="\033[31m";
colour["GREEN"] ="\033[32m";
colour["YELLOW"] ="\033[33m";
colour["BLUE"] ="\033[34m";
colour["PURPLE"] ="\033[35m";
colour["CYAN"] ="\033[36m";
colour["WHITE"] ="\033[37m";
colour["NORMAL"] ="\033[0;39m";
} else {
colour["BLACK"] ="";
colour["RED"] ="";
colour["GREEN"] ="";
colour["YELLOW"]="";
colour["BLUE"] ="";
colour["PURPLE"]="";
colour["CYAN"] ="";
colour["WHITE"] ="";
colour["NORMAL"]="";
}
};
};
class Logger {
protected:
int active;
std::string name, topName, COLOUR;
public:
static GridStopWatch StopWatch;
static std::ostream devnull;
Colours &Painter;
int active;
std::string name, topName;
std::string COLOUR;
static std::string BLACK;
static std::string RED ;
static std::string GREEN;
static std::string YELLOW;
static std::string BLUE ;
static std::string PURPLE;
static std::string CYAN ;
static std::string WHITE ;
static std::string NORMAL;
Logger(std::string topNm, int on, std::string nm,std::string col)
: active(on), name(nm), topName(topNm), COLOUR(col) {};
void Active(int on) {active = on;};
int isActive(void) {return active;};
friend std::ostream& operator<< (std::ostream& stream, const Logger& log){
if ( log.active ) {
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();
StopWatch.Start();
stream << BLACK <<std::setw(8) << std::left << log.topName << BLACK<< " : ";
stream << log.COLOUR <<std::setw(11) << log.name << BLACK << " : ";
stream << YELLOW <<std::setw(6) << now <<BLACK << " : " ;
stream << log.COLOUR;
return stream;
} else {
return devnull;
}
public:
static GridStopWatch StopWatch;
static std::ostream devnull;
std::string background() {return Painter.colour["NORMAL"];}
std::string evidence() {return Painter.colour["YELLOW"];}
std::string colour() {return Painter.colour[COLOUR];}
Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
: active(on),
name(nm),
topName(topNm),
Painter(col_class),
COLOUR(col){} ;
void Active(int on) {active = on;};
int isActive(void) {return active;};
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();
StopWatch.Start();
stream << log.background()<< log.topName << log.background()<< " : ";
stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
stream << log.evidence()<< now << log.background() << " : " << log.colour();
return stream;
} else {
return devnull;
}
}
};
class GridLogger: public Logger {
public:
GridLogger(int on, std::string nm, std::string col = Logger::BLACK): Logger("Grid", on, nm, col){};
GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
Logger("Grid", on, nm, col_class, col_key){};
};
void GridLogConfigure(std::vector<std::string> &logstreams);
@ -95,38 +141,40 @@ extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern Colours GridLogColours;
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() {\
char string[20]; \
std::sprintf(string,"backtrace.%d",Rank()); \
std::FILE * fp = std::fopen(string,"w"); \
BACKTRACEFP(fp)\
std::fclose(fp); \
char string[20]; \
std::sprintf(string,"backtrace.%d",Rank()); \
std::FILE * fp = std::fopen(string,"w"); \
BACKTRACEFP(fp)\
std::fclose(fp); \
}
#ifdef HAVE_EXECINFO_H
#define BACKTRACEFP(fp) { \
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
for (int i = 0; i < symbols; i++){\
std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
}\
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
for (int i = 0; i < symbols; i++){\
std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
}\
}
#else
#define BACKTRACEFP(fp) { \
std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
}
#endif
#define BACKTRACE() BACKTRACEFP(stdout)
}
#endif
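For reference, a minimal sketch of how the refactored logger is driven, using only the interface shown above (the stream names are those recognised by GridLogConfigure; everything else is illustrative):

#include <Grid.h>
using namespace Grid;

void logging_example(void)
{
  // Enable coloured output and the Debug stream; the Message stream is active
  // by default and can be silenced by passing "NoMessage".
  std::vector<std::string> streams = {"Colours", "Debug"};
  GridLogConfigure(streams);

  std::cout << GridLogMessage << "configuration loaded" << std::endl;
  std::cout << GridLogDebug   << "only printed when the Debug stream is enabled" << std::endl;
}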

File diff suppressed because one or more lines are too long


@ -1,6 +1,3 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/
extra_sources=
if BUILD_COMMS_MPI
extra_sources+=communicator/Communicator_mpi.cc
@ -17,16 +14,11 @@ endif
#
# Libraries
#
include Make.inc
include Eigen.inc
lib_LIBRARIES = libGrid.a
libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
# qcd/action/fermion/PartialFractionFermion5D.cc\ \
#
# Include files
#
nobase_include_HEADERS=$(HFILES)
libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
libGrid_adir = $(pkgincludedir)
nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h


@ -1,32 +1,33 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Simd.h
Source file: ./lib/Simd.h
Copyright (C) 2015
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SIMD_H
#define GRID_SIMD_H
@ -118,6 +119,14 @@ namespace Grid {
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
// define projections to real and imaginary parts
inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
// define auxiliary functions for complex computations
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
@ -163,8 +172,8 @@ namespace Grid {
};
#include <simd/Grid_vector_types.h>
#include <simd/Grid_vector_unops.h>
#include "simd/Grid_vector_types.h"
#include "simd/Grid_vector_unops.h"
namespace Grid {
// Default precision

lib/Stat.cc Normal file

@ -0,0 +1,247 @@
#include <Grid.h>
#include <PerfCount.h>
#include <Stat.h>
namespace Grid {
bool PmuStat::pmu_initialized=false;
void PmuStat::init(const char *regname)
{
#ifdef __x86_64__
name = regname;
if (!pmu_initialized)
{
std::cout<<"initialising pmu"<<std::endl;
pmu_initialized = true;
pmu_init();
}
clear();
#endif
}
void PmuStat::clear(void)
{
#ifdef __x86_64__
count = 0;
tregion = 0;
pmc0 = 0;
pmc1 = 0;
inst = 0;
cyc = 0;
ref = 0;
tcycles = 0;
reads = 0;
writes = 0;
#endif
}
void PmuStat::print(void)
{
#ifdef __x86_64__
std::cout <<"Reg "<<std::string(name)<<":\n";
std::cout <<" region "<<tregion<<std::endl;
std::cout <<" cycles "<<tcycles<<std::endl;
std::cout <<" inst "<<inst <<std::endl;
std::cout <<" cyc "<<cyc <<std::endl;
std::cout <<" ref "<<ref <<std::endl;
std::cout <<" pmc0 "<<pmc0 <<std::endl;
std::cout <<" pmc1 "<<pmc1 <<std::endl;
std::cout <<" count "<<count <<std::endl;
std::cout <<" reads "<<reads <<std::endl;
std::cout <<" writes "<<writes <<std::endl;
#endif
}
void PmuStat::start(void)
{
#ifdef __x86_64__
pmu_start();
++count;
xmemctrs(&mrstart, &mwstart);
tstart = __rdtsc();
#endif
}
void PmuStat::enter(int t)
{
#ifdef __x86_64__
counters[0][t] = __rdpmc(0);
counters[1][t] = __rdpmc(1);
counters[2][t] = __rdpmc((1<<30)|0);
counters[3][t] = __rdpmc((1<<30)|1);
counters[4][t] = __rdpmc((1<<30)|2);
counters[5][t] = __rdtsc();
#endif
}
void PmuStat::exit(int t)
{
#ifdef __x86_64__
counters[0][t] = __rdpmc(0) - counters[0][t];
counters[1][t] = __rdpmc(1) - counters[1][t];
counters[2][t] = __rdpmc((1<<30)|0) - counters[2][t];
counters[3][t] = __rdpmc((1<<30)|1) - counters[3][t];
counters[4][t] = __rdpmc((1<<30)|2) - counters[4][t];
counters[5][t] = __rdtsc() - counters[5][t];
#endif
}
void PmuStat::accum(int nthreads)
{
#ifdef __x86_64__
tend = __rdtsc();
xmemctrs(&mrend, &mwend);
pmu_stop();
for (int t = 0; t < nthreads; ++t) {
pmc0 += counters[0][t];
pmc1 += counters[1][t];
inst += counters[2][t];
cyc += counters[3][t];
ref += counters[4][t];
tcycles += counters[5][t];
}
uint64_t region = tend - tstart;
tregion += region;
uint64_t mreads = mrend - mrstart;
reads += mreads;
uint64_t mwrites = mwend - mwstart;
writes += mwrites;
#endif
}
void PmuStat::pmu_fini(void) {}
void PmuStat::pmu_start(void) {};
void PmuStat::pmu_stop(void) {};
void PmuStat::pmu_init(void)
{
#ifdef _KNIGHTS_LANDING_
KNLsetup();
#endif
}
void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
{
#ifdef _KNIGHTS_LANDING_
ctrs c;
KNLreadctrs(c);
uint64_t emr = 0, emw = 0;
for (int i = 0; i < NEDC; ++i)
{
emr += c.edcrd[i];
emw += c.edcwr[i];
}
*mr = emr;
*mw = emw;
#else
*mr = *mw = 0;
#endif
}
#ifdef _KNIGHTS_LANDING_
struct knl_gbl_ PmuStat::gbl;
#define PMU_MEM
void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
{
char fname[1024];
snprintf(fname, sizeof(fname), "%s/type", ename);
FILE *fp = fopen(fname, "r");
if (fp == 0) {
::printf("open %s", fname);
::exit(0);
}
int type;
int ret = fscanf(fp, "%d", &type);
assert(ret == 1);
fclose(fp);
// std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;
struct perf_event_attr hw = {};
hw.size = sizeof(hw);
hw.type = type;
// see /sys/devices/uncore_*/format/*
// All of the events we are interested in are configured the same way, but
// that isn't always true. Proper code would parse the format files
hw.config = event | (umask << 8);
//hw.read_format = PERF_FORMAT_GROUP;
// unfortunately the above only works within a single PMU; might
// as well just read them one at a time
int cpu = 0;
fd = perf_event_open(&hw, -1, cpu, -1, 0);
if (fd == -1) {
::printf("CPU %d, box %s, event 0x%lx", cpu, ename, hw.config);
::exit(0);
} else {
// std::cout << "event "<<std::string(ename)<<" set up for fd "<<fd<<" hw.config "<<hw.config <<std::endl;
}
}
void PmuStat::KNLsetup(void){
int ret;
char fname[1024];
// MC RPQ inserts and WPQ inserts (reads & writes)
for (int mc = 0; mc < NMC; ++mc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
// RPQ Inserts
KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
// WPQ Inserts
KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
}
// EDC RPQ inserts and WPQ inserts
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
// RPQ inserts
KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
// WPQ inserts
KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
}
// EDC HitE, HitM, MissE, MissM
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
}
}
uint64_t PmuStat::KNLreadctr(int fd)
{
uint64_t data;
size_t s = ::read(fd, &data, sizeof(data));
if (s != sizeof(uint64_t)){
::printf("read counter %lu", s);
::exit(0);
}
return data;
}
void PmuStat::KNLreadctrs(ctrs &c)
{
for (int i = 0; i < NMC; ++i)
{
c.mcrd[i] = KNLreadctr(gbl.mc_rd[i]);
c.mcwr[i] = KNLreadctr(gbl.mc_wr[i]);
}
for (int i = 0; i < NEDC; ++i)
{
c.edcrd[i] = KNLreadctr(gbl.edc_rd[i]);
c.edcwr[i] = KNLreadctr(gbl.edc_wr[i]);
}
for (int i = 0; i < NEDC; ++i)
{
c.edchite[i] = KNLreadctr(gbl.edc_hite[i]);
c.edchitm[i] = KNLreadctr(gbl.edc_hitm[i]);
c.edcmisse[i] = KNLreadctr(gbl.edc_misse[i]);
c.edcmissm[i] = KNLreadctr(gbl.edc_missm[i]);
}
}
#endif
}

lib/Stat.h Normal file

@ -0,0 +1,104 @@
#ifndef _GRID_STAT_H
#define _GRID_STAT_H
#ifdef AVX512
#define _KNIGHTS_LANDING_ROOTONLY
#endif
namespace Grid {
///////////////////////////////////////////////////////////////////////////////
// Extra KNL counters from MCDRAM
///////////////////////////////////////////////////////////////////////////////
#ifdef _KNIGHTS_LANDING_
#define NMC 6
#define NEDC 8
struct ctrs
{
uint64_t mcrd[NMC];
uint64_t mcwr[NMC];
uint64_t edcrd[NEDC];
uint64_t edcwr[NEDC];
uint64_t edchite[NEDC];
uint64_t edchitm[NEDC];
uint64_t edcmisse[NEDC];
uint64_t edcmissm[NEDC];
};
// Peter/Azusa:
// Our modification of a code provided by Larry Meadows from Intel
// Verified by email exchange (non-NDA), OK for GitHub: the code only uses the /sys/devices/ filesystem,
// so the interface is already public and in the Linux kernel for KNL.
struct knl_gbl_
{
int mc_rd[NMC];
int mc_wr[NMC];
int edc_rd[NEDC];
int edc_wr[NEDC];
int edc_hite[NEDC];
int edc_hitm[NEDC];
int edc_misse[NEDC];
int edc_missm[NEDC];
};
#endif
///////////////////////////////////////////////////////////////////////////////
class PmuStat
{
uint64_t counters[8][256];
#ifdef _KNIGHTS_LANDING_
static struct knl_gbl_ gbl;
#endif
const char *name;
uint64_t reads; // memory reads
uint64_t writes; // memory writes
uint64_t mrstart; // memory read counter at start of parallel region
uint64_t mrend; // memory read counter at end of parallel region
uint64_t mwstart; // memory write counter at start of parallel region
uint64_t mwend; // memory write counter at end of parallel region
// cumulative counters
uint64_t count; // number of invocations
uint64_t tregion; // total time in parallel region (from thread 0)
uint64_t tcycles; // total cycles inside parallel region
uint64_t inst, ref, cyc; // fixed counters
uint64_t pmc0, pmc1;// pmu
// add memory counters here
// temp variables
uint64_t tstart; // tsc at start of parallel region
uint64_t tend; // tsc at end of parallel region
// map for ctrs values
// 0 pmc0 start
// 1 pmc0 end
// 2 pmc1 start
// 3 pmc1 end
// 4 tsc start
// 5 tsc end
static bool pmu_initialized;
public:
static bool is_init(void){ return pmu_initialized;}
static void pmu_init(void);
static void pmu_fini(void);
static void pmu_start(void);
static void pmu_stop(void);
void accum(int nthreads);
static void xmemctrs(uint64_t *mr, uint64_t *mw);
void start(void);
void enter(int t);
void exit(int t);
void print(void);
void init(const char *regname);
void clear(void);
#ifdef _KNIGHTS_LANDING_
static void KNLsetup(void);
static uint64_t KNLreadctr(int fd);
static void KNLreadctrs(ctrs &c);
static void KNLevsetup(const char *ename, int &fd, int event, int umask);
#endif
};
}
#endif
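A minimal sketch of the call pattern this interface appears designed for, inferred from Stat.cc above; the OpenMP region and the kernel are illustrative, Grid.h is assumed to pull in Stat.h, and the counters only do real work on x86_64 (memory counters on Knights Landing only):

#include <omp.h>
#include <Grid.h>
using namespace Grid;

void measured_kernel(void)
{
  static PmuStat stat;
  stat.init("my_kernel");            // initialises the PMU on first use, then clears the counters

  int nthreads = omp_get_max_threads();
  stat.start();                      // snapshot TSC and memory counters at region entry
#pragma omp parallel num_threads(nthreads)
  {
    int t = omp_get_thread_num();
    stat.enter(t);                   // per-thread counter snapshot
    // ... kernel being measured ...
    stat.exit(t);                    // per-thread counter deltas
  }
  stat.accum(nthreads);              // fold per-thread deltas into the cumulative totals
  stat.print();                      // dump cycles, instructions and memory read/write totals
}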


@ -30,7 +30,7 @@
#include <thread>
#include <stencil/Lebesgue.h> // subdir aggregate
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
//////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient
@ -70,9 +70,70 @@
namespace Grid {
template<class vobj,class cobj,class compressor> void
Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
{
table.resize(0);
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) {
table.resize(e1*e2);
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int bo = n*e2;
table[bo+b]=std::pair<int,int>(bo+b,o+b);
}
}
} else {
int bo=0;
table.resize(e1*e2/2);
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
if ( ocb &cbmask ) {
table[bo]=std::pair<int,int>(bo,o+b); bo++;
}
}
}
}
}
template<class vobj,class cobj,class compressor> void
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,
compressor &compress, int off,int so)
{
PARALLEL_FOR_LOOP
for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
}
}
template<class vobj,class cobj,class compressor> void
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
double &t_table ,double & t_data )
{
std::vector<std::pair<int,int> > table;
Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table);
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
Gather_plane_simple_table (table,rhs,buffer,compress,off,so);
}
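
The two helpers above separate the expensive index computation from the copy itself: the (buffer index, lattice index) table is built once, and every subsequent halo exchange only replays the tight copy loop. A standalone sketch of the same compute-once / reuse-many pattern (plain C++, hypothetical index rule, not Grid code):

#include <vector>
#include <utility>

// Build the gather table on first use, then reuse it for every later gather.
// Assumes `field` is at least twice as long as `buffer` (placeholder rule below).
std::vector<std::pair<int,int>> face_table;    // (buffer index, field index)
bool face_table_computed = false;

void gather_face(const std::vector<double> &field, std::vector<double> &buffer) {
  if (!face_table_computed) {                  // expensive index work, once
    for (int i = 0; i < (int)buffer.size(); i++)
      face_table.push_back(std::make_pair(i, 2 * i));  // placeholder index rule
    face_table_computed = true;
  }
  for (size_t i = 0; i < face_table.size(); i++)       // cheap copy, every call
    buffer[face_table[i].first] = field[face_table[i].second];
}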
struct StencilEntry {
uint32_t _offset;
uint32_t _byte_offset;
uint64_t _offset;
uint64_t _byte_offset;
uint16_t _is_local;
uint16_t _permute;
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
@ -101,12 +162,14 @@
};
std::vector<Packet> Packets;
int face_table_computed;
std::vector<std::vector<std::pair<int,int> > > face_table ;
#define SEND_IMMEDIATE
#define SERIAL_SENDS
#define SEND_IMMEDIATE
#define SERIAL_SENDS
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
comms_bytes+=2.0*bytes;
#ifdef SEND_IMMEDIATE
commtime-=usecond();
_grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
@ -256,7 +319,8 @@
if( _entries[i]._is_local ) {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
} else {
_entries[i]._byte_offset =(uint64_t)&comm_buf[0]+ _entries[i]._offset*sizeof(cobj);
// PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
}
}
};
@ -265,17 +329,21 @@
// _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
}
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
uint64_t cbase = (uint64_t)&comm_buf[0];
local = _entries[ent]._is_local;
perm = _entries[ent]._permute;
if (perm) ptype = _permute_type[point];
if (local) return base + _entries[ent]._byte_offset;
else return _entries[ent]._byte_offset;
if (local) {
return base + _entries[ent]._byte_offset;
} else {
return cbase + _entries[ent]._byte_offset;
}
}
inline uint64_t GetPFInfo(int ent,uint64_t base) {
uint64_t cbase = (uint64_t)&comm_buf[0];
int local = _entries[ent]._is_local;
if (local) return base + _entries[ent]._byte_offset;
else return _entries[ent]._byte_offset;
if (local) return base + _entries[ent]._byte_offset;
else return cbase + _entries[ent]._byte_offset;
}
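
GetInfo and GetPFInfo now reconstruct the absolute address at lookup time: local entries store a byte offset relative to the field base passed in, while non-local entries store an offset relative to the communications buffer, whose base (cbase) is added here rather than being baked into the table. A tiny standalone sketch of that addressing convention (hypothetical names, not Grid code):

#include <cstdint>

struct Entry { uint64_t byte_offset; bool is_local; };

// Resolve a stencil entry to an absolute address from the two possible bases.
inline uint64_t resolve(const Entry &e, uint64_t field_base, uint64_t comm_base) {
  return (e.is_local ? field_base : comm_base) + e.byte_offset;
}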
// Comms buffers
@ -301,6 +369,48 @@
double gathermtime;
double splicetime;
double nosplicetime;
double t_data;
double t_table;
double calls;
void ZeroCounters(void) {
gathertime = 0.;
jointime = 0.;
commtime = 0.;
halogtime = 0.;
mergetime = 0.;
spintime = 0.;
gathermtime = 0.;
splicetime = 0.;
nosplicetime = 0.;
t_data = 0.0;
t_table= 0.0;
comms_bytes = 0.;
calls = 0.;
};
void Report(void) {
#define PRINTIT(A) \
std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
if ( calls > 0. ) {
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
PRINTIT(halogtime);
PRINTIT(gathertime);
PRINTIT(gathermtime);
PRINTIT(mergetime);
if(comms_bytes>1.0){
PRINTIT(comms_bytes);
PRINTIT(commtime);
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<<std::endl;
}
PRINTIT(jointime);
PRINTIT(spintime);
PRINTIT(splicetime);
PRINTIT(nosplicetime);
PRINTIT(t_table);
PRINTIT(t_data);
}
};
#endif
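
How the new stencil timers are typically driven is not shown in this diff; a hedged sketch, assuming a fermion operator that exposes its CartesianStencil as a public member named Stencil and a Dhop(in, out, dag) method, would be:

// Hypothetical benchmark wrapper for the ZeroCounters()/Report() pair above.
template<class FermOp, class Field>
void time_dhop(FermOp &Dw, const Field &src, Field &result, int nloop) {
  Dw.Stencil.ZeroCounters();           // reset gather/comms/merge timers
  for (int i = 0; i < nloop; i++)
    Dw.Dhop(src, result, 0);           // each halo exchange increments calls
  Dw.Stencil.Report();                 // per-call averages and comms GB/s
}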
CartesianStencil(GridBase *grid,
@ -310,18 +420,7 @@
const std::vector<int> &distances)
: _permute_type(npoints), _comm_buf_size(npoints)
{
#ifdef TIMING_HACK
gathertime=0;
jointime=0;
commtime=0;
halogtime=0;
mergetime=0;
spintime=0;
gathermtime=0;
splicetime=0;
nosplicetime=0;
comms_bytes=0;
#endif
face_table_computed=0;
_npoints = npoints;
_grid = grid;
_directions = directions;
@ -623,6 +722,7 @@
template<class compressor>
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{
calls++;
Mergers.resize(0);
Packets.resize(0);
HaloGather(source,compress);
@ -648,7 +748,7 @@
}
#endif
template<class compressor>
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point)
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
{
int dimension = _directions[point];
int displacement = _distances[point];
@ -676,23 +776,23 @@
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
splicetime-=usecond();
GatherSimd(source,dimension,shift,0x3,compress);
GatherSimd(source,dimension,shift,0x3,compress,face_idx);
splicetime+=usecond();
} else {
nosplicetime-=usecond();
Gather(source,dimension,shift,0x3,compress);
Gather(source,dimension,shift,0x3,compress,face_idx);
nosplicetime+=usecond();
}
} else {
if(splice_dim){
splicetime-=usecond();
GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes
GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration
GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
splicetime+=usecond();
} else {
nosplicetime-=usecond();
Gather(source,dimension,shift,0x1,compress);
Gather(source,dimension,shift,0x2,compress);
Gather(source,dimension,shift,0x1,compress,face_idx);
Gather(source,dimension,shift,0x2,compress,face_idx);
nosplicetime+=usecond();
}
}
@ -710,17 +810,19 @@
u_comm_offset=0;
// Gather all comms buffers
int face_idx=0;
for(int point = 0 ; point < _npoints; point++) {
compress.Point(point);
HaloGatherDir(source,compress,point);
HaloGatherDir(source,compress,point,face_idx);
}
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
halogtime+=usecond();
}
template<class compressor>
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx)
{
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
@ -757,8 +859,20 @@
int bytes = words * sizeof(cobj);
gathertime-=usecond();
Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset);
int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane
if ( !face_table_computed ) {
t_table-=usecond();
face_table.resize(face_idx+1);
Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]);
t_table+=usecond();
}
t_data-=usecond();
Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so);
face_idx++;
t_data+=usecond();
gathertime+=usecond();
// Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data);
int rank = _grid->_processor;
int recv_from_rank;
@ -781,7 +895,7 @@
template<class compressor>
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
{
const int Nsimd = _grid->Nsimd();

View File

@ -30,22 +30,22 @@ Author: neo <cossu@post.kek.jp>
#ifndef GRID_MATH_H
#define GRID_MATH_H
#include <tensors/Tensor_traits.h>
#include <tensors/Tensor_class.h>
#include <tensors/Tensor_arith.h>
#include <tensors/Tensor_inner.h>
#include <tensors/Tensor_outer.h>
#include <tensors/Tensor_transpose.h>
#include <tensors/Tensor_trace.h>
#include <tensors/Tensor_index.h>
#include <tensors/Tensor_Ta.h>
#include <tensors/Tensor_determinant.h>
#include <tensors/Tensor_exp.h>
//#include <tensors/Tensor_peek.h>
//#include <tensors/Tensor_poke.h>
#include <tensors/Tensor_reality.h>
#include <tensors/Tensor_unary.h>
#include <tensors/Tensor_extract_merge.h>
#include <tensors/Tensor_logical.h>
#include <Grid/tensors/Tensor_traits.h>
#include <Grid/tensors/Tensor_class.h>
#include <Grid/tensors/Tensor_arith.h>
#include <Grid/tensors/Tensor_inner.h>
#include <Grid/tensors/Tensor_outer.h>
#include <Grid/tensors/Tensor_transpose.h>
#include <Grid/tensors/Tensor_trace.h>
#include <Grid/tensors/Tensor_index.h>
#include <Grid/tensors/Tensor_Ta.h>
#include <Grid/tensors/Tensor_determinant.h>
#include <Grid/tensors/Tensor_exp.h>
//#include <Grid/tensors/Tensor_peek.h>
//#include <Grid/tensors/Tensor_poke.h>
#include <Grid/tensors/Tensor_reality.h>
#include <Grid/tensors/Tensor_unary.h>
#include <Grid/tensors/Tensor_extract_merge.h>
#include <Grid/tensors/Tensor_logical.h>
#endif

View File

@ -37,7 +37,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_OMP
#include <omp.h>
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ")
#ifdef GRID_NUMA
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#else
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
#endif
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#else
#define PARALLEL_FOR_LOOP
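
The change above only alters the pragma the macro expands to: under GRID_OMP with GRID_NUMA defined the schedule is fixed to static, otherwise schedule(runtime) lets OMP_SCHEDULE pick it at run time. A user-level site loop is unchanged; a minimal sketch (placeholder loop body and field type):

// Typical Grid site loop under the macro; the schedule clause is chosen at
// compile time by GRID_NUMA, or at run time via OMP_SCHEDULE otherwise.
template<class Field>
void copy_sites(const Field &in, Field &out) {
PARALLEL_FOR_LOOP
  for (int ss = 0; ss < in._grid->oSites(); ss++) {
    out._odata[ss] = in._odata[ss];    // placeholder site-local work
  }
}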

View File

@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H
#include <Grid.h>
namespace Grid {

View File

@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHM_SPARSE_MATRIX_H
#define GRID_ALGORITHM_SPARSE_MATRIX_H
#include <Grid.h>
namespace Grid {

View File

@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CHEBYSHEV_H
#define GRID_CHEBYSHEV_H
#include<Grid.h>
#include<algorithms/LinearOperator.h>
#include <Grid/algorithms/LinearOperator.h>
namespace Grid {

View File

@ -18,10 +18,10 @@
#include <stddef.h>
#include <Config.h>
#ifdef HAVE_GMP_H
#include <algorithms/approx/bigfloat.h>
#ifdef HAVE_LIBGMP
#include "bigfloat.h"
#else
#include <algorithms/approx/bigfloat_double.h>
#include "bigfloat_double.h"
#endif
#define JMAX 10000 //Maximum number of iterations of Newton's approximation

View File

@ -1,150 +1,168 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradient.h
Source file: ./lib/algorithms/iterative/ConjugateGradient.h
Copyright (C) 2015
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {
};
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
Field &psi) {
psi.checkerboard = src.checkerboard;
conformable(psi, src);
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD cp, c, a, d, b, ssq, qq, b_pred;
psi.checkerboard = src.checkerboard;
conformable(psi,src);
Field p(src);
Field mmp(src);
Field r(src);
RealD cp,c,a,d,b,ssq,qq,b_pred;
Field p(src);
Field mmp(src);
Field r(src);
//Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess)==0);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi,mmp,d,b);
r= src-mmp;
p= r;
a =norm2(p);
cp =a;
ssq=norm2(src);
Linop.HermOpAndNorm(psi, mmp, d, b);
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: src "<<ssq <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mp "<<d <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mmp "<<b <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: cp,r "<<cp <<std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: p "<<a <<std::endl;
r = src - mmp;
p = r;
RealD rsq = Tolerance* Tolerance*ssq;
//Check if guess is really REALLY good :)
if ( cp <= rsq ) {
return;
}
std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
a = norm2(p);
cp = a;
ssq = norm2(src);
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: p " << a << std::endl;
SolverTimer.Start();
int k;
for (k=1;k<=MaxIterations;k++){
c=cp;
RealD rsq = Tolerance * Tolerance * ssq;
MatrixTimer.Start();
Linop.HermOpAndNorm(p,mmp,d,qq);
MatrixTimer.Stop();
LinalgTimer.Start();
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);
a = c/d;
b_pred = a*(a*qq-d)/c;
cp = axpy_norm(r,-a,mmp,r);
b = cp/c;
// Fuse these loops ; should be really easy
psi= a*p+psi;
p = p*b+r;
LinalgTimer.Stop();
std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;
// Stopping condition
if ( cp <= rsq ) {
SolverTimer.Stop();
Linop.HermOpAndNorm(psi,mmp,d,qq);
p=mmp-src;
RealD mmpnorm = sqrt(norm2(mmp));
RealD psinorm = sqrt(norm2(psi));
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm/srcnorm;
std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
<<" computed residual "<<sqrt(cp/ssq)
<<" true residual " <<true_residual
<<" target "<<Tolerance<<std::endl;
std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
std::cout<<std::endl;
assert(true_residual/Tolerance < 1000.0);
return;
}
}
std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
assert(0);
// Check if guess is really REALLY good :)
if (cp <= rsq) {
return;
}
};
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq
<< std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop.HermOpAndNorm(p, mmp, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r, -a, mmp, r);
b = cp / c;
// Fuse these loops ; should be really easy
psi = a * p + psi;
p = p * b + r;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD mmpnorm = sqrt(norm2(mmp));
RealD psinorm = sqrt(norm2(psi));
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage
<< "ConjugateGradient: Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual << " target "
<< Tolerance << std::endl;
std::cout << GridLogMessage << "Time elapsed: Iterations "
<< SolverTimer.Elapsed() << " Matrix "
<< MatrixTimer.Elapsed() << " Linalg "
<< LinalgTimer.Elapsed();
std::cout << std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0);
return;
}
}
std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
}
};
}
#endif
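
A short sketch of the new ErrorOnNoConverge switch (field types here are placeholders): the default keeps the old assert-on-failure behaviour, while an inner or restarted solve can opt out and simply return after MaxIterations, as the mixed-precision solver below does for its single-precision inner CG.

// Default (err_on_no_conv = true): the solver asserts if it fails to converge.
ConjugateGradient<LatticeFermion>  CG_outer(1.0e-8, 10000);

// An inner solve can opt out via the constructor ...
ConjugateGradient<LatticeFermionF> CG_inner(1.0e-5, 1000, false);
// ... or by flipping the public flag afterwards, as MixedPrecisionConjugateGradient does.
CG_inner.ErrorOnNoConverge = false;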

View File

@ -0,0 +1,142 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
namespace Grid {
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
RealD Tolerance;
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL){ };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.checkerboard;
sol_d.checkerboard = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid;
FieldD tmp_d(DoublePrecGrid);
tmp_d.checkerboard = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.checkerboard = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = Tolerance;
FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.checkerboard = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
zeroit(sol_f);
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
}
#endif
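
A hedged set-up sketch for the mixed-precision solver above. Only the constructor signature, useGuesser and operator() shown in this file are relied on; the operator types, grid pointer and field names are assumptions for illustration.

// Hypothetical wiring of the restarted mixed-precision CG.
MdagMLinearOperator<WilsonFermionF, LatticeFermionF> HermOpF(DwF);  // single prec
MdagMLinearOperator<WilsonFermionD, LatticeFermionD> HermOpD(DwD);  // double prec

MixedPrecisionConjugateGradient<LatticeFermionD, LatticeFermionF>
    mCG(1.0e-8,      // outer (true residual) tolerance
        10000,       // max single-precision iterations per inner solve
        50,          // max outer restarts
        sp_grid,     // GridBase* for the single-precision fields
        HermOpF, HermOpD);

mCG(src_d, sol_d);   // defect-correction solve to double-precision accuracy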

View File

@ -130,8 +130,8 @@ DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st,
}
#include <algorithms/iterative/Householder.h>
#include <algorithms/iterative/Francis.h>
#include "Householder.h"
#include "Francis.h"
#endif

View File

@ -33,8 +33,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef USE_LAPACK
#include <lapacke.h>
#endif
#include <algorithms/iterative/DenseMatrix.h>
#include <algorithms/iterative/EigenSort.h>
#include "DenseMatrix.h"
#include "EigenSort.h"
namespace Grid {

View File

@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H
#include <Grid.h>
namespace Grid{
@ -82,11 +81,8 @@ public:
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
int CheckerBoardFromOindex (int Oindex){
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0;
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
@ -107,6 +103,12 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
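
The iIndex made virtual here returns the SIMD-lane index of a site: each dimension contributes lcoor[d]/_rdimensions[d], weighted by _istride[d], while oIndex uses the remainder lcoor[d]%_rdimensions[d]. A worked 1-d toy example, assuming a local extent of 8 split across 2 SIMD lanes (so the reduced extent is 4 and both strides are 1):

// Toy check of the outer/inner index split for one dimension.
#include <cassert>
int main() {
  const int rdim = 4;     // reduced local extent (= 8 sites / 2 lanes)
  const int x    = 5;     // local coordinate of the site
  int o_idx = x % rdim;   // outer index within the per-lane volume -> 1
  int i_idx = x / rdim;   // SIMD lane that holds the site          -> 1
  assert(o_idx == 1 && i_idx == 1);
  return 0;
}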
inline int oIndexReduced(std::vector<int> &ocoor)
{
int idx=0;
@ -123,12 +125,6 @@ public:
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
inline int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
@ -220,7 +216,7 @@ public:
}
i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
o_idx= oIndex(lcoor); // this implies divide by 2 on checkerdim
}
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)

View File

@ -39,6 +39,13 @@ class GridCartesian: public GridBase {
public:
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
virtual int CheckerBoardFromOindex (int Oindex)
{
return 0;
}
virtual int CheckerBoarded(int dim){
return 0;
}

View File

@ -32,23 +32,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
// Perhaps these are misplaced and
// should be in sparse matrix.
// Also should make these a named enum type
static const int DaggerNo=0;
static const int DaggerYes=1;
static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
std::vector<int> _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
@ -78,12 +73,20 @@ public:
// or by looping over x,y,z and multiply rather than computing checkerboard.
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
}
virtual int CheckerBoardFromOindexTable (int Oindex) {
return _checker_board[Oindex];
}
virtual int CheckerBoardFromOindex (int Oindex)
{
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != _checker_dim) return shift;
@ -175,7 +178,7 @@ public:
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if ( _simd_layout[d]>1 ) {
if ( d != _checker_dim ) {
if ( checker_dim_mask[d] ) {
assert( (_rdimensions[d]&0x1) == 0 );
}
}
@ -191,6 +194,8 @@ public:
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
@ -211,6 +216,18 @@ public:
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
}
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for(int d=0;d<_ndimension;d++){
rvol=rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for(int osite=0;osite<_osites;osite++){
_checker_board[osite] = CheckerBoardFromOindex (osite);
}
};
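
The lookup table built at the end of the constructor trades one pass over the reduced volume at grid construction for a single array read in CheckerBoardFromOindexTable, replacing the coordinate decode done by CheckerBoardFromOindex in the gather inner loop. A standalone toy of the same idea (hypothetical 2-d layout, not Grid code):

#include <vector>

// Precompute site parity once so hot loops can use a plain table lookup.
static inline int parity_from_coords(int x, int y) { return (x + y) & 1; }

std::vector<int> build_checker_board(int Lx, int Ly) {
  std::vector<int> cb(Lx * Ly);
  for (int y = 0; y < Ly; y++)
    for (int x = 0; x < Lx; x++)
      cb[y * Lx + x] = parity_from_coords(x, y);  // decode once per site
  return cb;                                      // index it in the hot loop
}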
protected:
@ -224,9 +241,21 @@ protected:
idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
}
}
return idx;
return idx;
};
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) {
if( d==_checker_dim ) {
idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
} else {
idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
}
}
return idx;
}
};
}

View File

@ -1,3 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -56,6 +57,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) {
PARALLEL_NESTED_LOOP2
@ -68,15 +70,20 @@ PARALLEL_NESTED_LOOP2
}
} else {
int bo=0;
std::vector<std::pair<int,int> > table;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
if ( ocb &cbmask ) {
buffer[off+bo++]=compress(rhs._odata[so+o+b]);
table.push_back(std::pair<int,int> (bo++,o+b));
}
}
}
PARALLEL_FOR_LOOP
for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
}
}
}

412
lib/fftw/fftw3.h Normal file
View File

@ -0,0 +1,412 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* The following statement of license applies *only* to this header file,
* and *not* to the other files distributed with FFTW or derived therefrom:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/***************************** NOTE TO USERS *********************************
*
* THIS IS A HEADER FILE, NOT A MANUAL
*
* If you want to know how to use FFTW, please read the manual,
* online at http://www.fftw.org/doc/ and also included with FFTW.
* For a quick start, see the manual's tutorial section.
*
* (Reading header files to learn how to use a library is a habit
* stemming from code lacking a proper manual. Arguably, it's a
* *bad* habit in most cases, because header files can contain
* interfaces that are not part of the public, stable API.)
*
****************************************************************************/
#ifndef FFTW3_H
#define FFTW3_H
#include <stdio.h>
#ifdef __cplusplus
extern "C"
{
#endif /* __cplusplus */
/* If <complex.h> is included, use the C99 complex type. Otherwise
define a type bit-compatible with C99 complex */
#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
# define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
#else
# define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
#endif
#define FFTW_CONCAT(prefix, name) prefix ## name
#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
/* IMPORTANT: for Windows compilers, you should add a line
#define FFTW_DLL
here and in kernel/ifftw.h if you are compiling/using FFTW as a
DLL, in order to do the proper importing/exporting, or
alternatively compile with -DFFTW_DLL or the equivalent
command-line flag. This is not necessary under MinGW/Cygwin, where
libtool does the imports/exports automatically. */
#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
/* annoying Windows syntax for shared-library declarations */
# if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
# define FFTW_EXTERN extern __declspec(dllexport)
# else /* user is calling FFTW; import symbol */
# define FFTW_EXTERN extern __declspec(dllimport)
# endif
#else
# define FFTW_EXTERN extern
#endif
enum fftw_r2r_kind_do_not_use_me {
FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
};
struct fftw_iodim_do_not_use_me {
int n; /* dimension size */
int is; /* input stride */
int os; /* output stride */
};
#include <stddef.h> /* for ptrdiff_t */
struct fftw_iodim64_do_not_use_me {
ptrdiff_t n; /* dimension size */
ptrdiff_t is; /* input stride */
ptrdiff_t os; /* output stride */
};
typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
typedef int (*fftw_read_char_func_do_not_use_me)(void *);
/*
huge second-order macro that defines prototypes for all API
functions. We expand this macro for each supported precision
X: name-mangling macro
R: real data type
C: complex data type
*/
#define FFTW_DEFINE_API(X, R, C) \
\
FFTW_DEFINE_COMPLEX(R, C); \
\
typedef struct X(plan_s) *X(plan); \
\
typedef struct fftw_iodim_do_not_use_me X(iodim); \
typedef struct fftw_iodim64_do_not_use_me X(iodim64); \
\
typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind); \
\
typedef fftw_write_char_func_do_not_use_me X(write_char_func); \
typedef fftw_read_char_func_do_not_use_me X(read_char_func); \
\
FFTW_EXTERN void X(execute)(const X(plan) p); \
\
FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n, \
C *in, C *out, int sign, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1, \
C *in, C *out, int sign, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2, \
C *in, C *out, int sign, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n, \
int howmany, \
C *in, const int *inembed, \
int istride, int idist, \
C *out, const int *onembed, \
int ostride, int odist, \
int sign, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
C *in, C *out, \
int sign, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *ri, R *ii, R *ro, R *io, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank, \
const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
C *in, C *out, \
int sign, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank, \
const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *ri, R *ii, R *ro, R *io, \
unsigned flags); \
\
FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out); \
FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, \
R *ro, R *io); \
\
FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n, \
int howmany, \
R *in, const int *inembed, \
int istride, int idist, \
C *out, const int *onembed, \
int ostride, int odist, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n, \
R *in, C *out, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1, \
R *in, C *out, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1, \
int n2, \
R *in, C *out, unsigned flags); \
\
\
FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n, \
int howmany, \
C *in, const int *inembed, \
int istride, int idist, \
R *out, const int *onembed, \
int ostride, int odist, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n, \
C *in, R *out, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1, \
C *in, R *out, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1, \
int n2, \
C *in, R *out, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *in, C *out, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
C *in, R *out, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)( \
int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *in, R *ro, R *io, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)( \
int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *ri, R *ii, R *out, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank, \
const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *in, C *out, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank, \
const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
C *in, R *out, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)( \
int rank, const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *in, R *ro, R *io, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)( \
int rank, const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *ri, R *ii, R *out, \
unsigned flags); \
\
FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out); \
FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out); \
\
FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p, \
R *in, R *ro, R *io); \
FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p, \
R *ri, R *ii, R *out); \
\
FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n, \
int howmany, \
R *in, const int *inembed, \
int istride, int idist, \
R *out, const int *onembed, \
int ostride, int odist, \
const X(r2r_kind) *kind, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out, \
const X(r2r_kind) *kind, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out, \
X(r2r_kind) kind, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out, \
X(r2r_kind) kind0, X(r2r_kind) kind1, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2, \
R *in, R *out, X(r2r_kind) kind0, \
X(r2r_kind) kind1, X(r2r_kind) kind2, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *in, R *out, \
const X(r2r_kind) *kind, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *in, R *out, \
const X(r2r_kind) *kind, unsigned flags); \
\
FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out); \
\
FFTW_EXTERN void X(destroy_plan)(X(plan) p); \
FFTW_EXTERN void X(forget_wisdom)(void); \
FFTW_EXTERN void X(cleanup)(void); \
\
FFTW_EXTERN void X(set_timelimit)(double t); \
\
FFTW_EXTERN void X(plan_with_nthreads)(int nthreads); \
FFTW_EXTERN int X(init_threads)(void); \
FFTW_EXTERN void X(cleanup_threads)(void); \
\
FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename); \
FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file); \
FFTW_EXTERN char *X(export_wisdom_to_string)(void); \
FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char, \
void *data); \
FFTW_EXTERN int X(import_system_wisdom)(void); \
FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename); \
FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file); \
FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string); \
FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
\
FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file); \
FFTW_EXTERN void X(print_plan)(const X(plan) p); \
FFTW_EXTERN char *X(sprint_plan)(const X(plan) p); \
\
FFTW_EXTERN void *X(malloc)(size_t n); \
FFTW_EXTERN R *X(alloc_real)(size_t n); \
FFTW_EXTERN C *X(alloc_complex)(size_t n); \
FFTW_EXTERN void X(free)(void *p); \
\
FFTW_EXTERN void X(flops)(const X(plan) p, \
double *add, double *mul, double *fmas); \
FFTW_EXTERN double X(estimate_cost)(const X(plan) p); \
FFTW_EXTERN double X(cost)(const X(plan) p); \
\
FFTW_EXTERN int X(alignment_of)(R *p); \
FFTW_EXTERN const char X(version)[]; \
FFTW_EXTERN const char X(cc)[]; \
FFTW_EXTERN const char X(codelet_optim)[];
/* end of FFTW_DEFINE_API macro */
FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
&& !(defined(__ICC) || defined(__INTEL_COMPILER)) \
&& (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
# if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
/* note: __float128 is a typedef, which is not supported with the _Complex
keyword in gcc, so instead we use this ugly __attribute__ version.
However, we can't simply pass the __attribute__ version to
FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
types. Hence redefining FFTW_DEFINE_COMPLEX. Ugh. */
# undef FFTW_DEFINE_COMPLEX
# define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
# endif
FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
#endif
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#define FFTW_NO_TIMELIMIT (-1.0)
/* documented flags */
#define FFTW_MEASURE (0U)
#define FFTW_DESTROY_INPUT (1U << 0)
#define FFTW_UNALIGNED (1U << 1)
#define FFTW_CONSERVE_MEMORY (1U << 2)
#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
#define FFTW_ESTIMATE (1U << 6)
#define FFTW_WISDOM_ONLY (1U << 21)
/* undocumented beyond-guru flags */
#define FFTW_ESTIMATE_PATIENT (1U << 7)
#define FFTW_BELIEVE_PCOST (1U << 8)
#define FFTW_NO_DFT_R2HC (1U << 9)
#define FFTW_NO_NONTHREADED (1U << 10)
#define FFTW_NO_BUFFERING (1U << 11)
#define FFTW_NO_INDIRECT_OP (1U << 12)
#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
#define FFTW_NO_RANK_SPLITS (1U << 14)
#define FFTW_NO_VRANK_SPLITS (1U << 15)
#define FFTW_NO_VRECURSE (1U << 16)
#define FFTW_NO_SIMD (1U << 17)
#define FFTW_NO_SLOW (1U << 18)
#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
#define FFTW_ALLOW_PRUNING (1U << 20)
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* FFTW3_H */

View File

@ -1,73 +1,74 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_ET.h
Source file: ./lib/lattice/Lattice_ET.h
Copyright (C) 2015
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_ET_H
#define GRID_LATTICE_ET_H
#include <iostream>
#include <vector>
#include <tuple>
#include <typeinfo>
#include <vector>
namespace Grid {
////////////////////////////////////////////////////
// Predicated where support
////////////////////////////////////////////////////
template<class iobj,class vobj,class robj>
inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) {
////////////////////////////////////////////////////
// Predicated where support
////////////////////////////////////////////////////
template <class iobj, class vobj, class robj>
inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
const robj &iffalse) {
typename std::remove_const<vobj>::type ret;
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
const int words = sizeof(vobj) / sizeof(vector_type);
const int Nsimd = vobj::vector_type::Nsimd();
const int words = sizeof(vobj)/sizeof(vector_type);
std::vector<Integer> mask(Nsimd);
std::vector<scalar_object> truevals(Nsimd);
std::vector<scalar_object> falsevals(Nsimd);
std::vector<Integer> mask(Nsimd);
std::vector<scalar_object> truevals (Nsimd);
std::vector<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
extract(iftrue ,truevals);
extract(iffalse ,falsevals);
extract<vInteger,Integer>(TensorRemove(predicate),mask);
for(int s=0;s<Nsimd;s++){
if (mask[s]) falsevals[s]=truevals[s];
}
merge(ret,falsevals);
return ret;
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
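
predicatedWhere performs a per-SIMD-lane select: both operands are extracted to scalars, the false values are overwritten wherever the mask is set, and the result is merged back into a vector object. A scalar analogue of that lane loop (standalone, hypothetical lane count):

#include <vector>

// Lane-wise select: keep the true value where the mask is set, else the false one.
std::vector<double> lane_select(const std::vector<int>    &mask,
                                const std::vector<double> &truevals,
                                std::vector<double>        falsevals) {
  for (size_t s = 0; s < mask.size(); s++)
    if (mask[s]) falsevals[s] = truevals[s];
  return falsevals;   // the merged, per-lane result
}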
////////////////////////////////////////////
// recursive evaluation of expressions; Could
// switch to generic approach with variadics, a la
@ -75,303 +76,351 @@ namespace Grid {
// from tuple is hideous; C++14 introduces std::make_index_sequence for this
////////////////////////////////////////////
// leaf eval of lattice ; should enable if protect using traits
//leaf eval of lattice ; should enable if protect using traits
template <typename T>
using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice = std::is_base_of<LatticeBase,T >;
template <typename T>
using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
//Specialization of getVectorType for lattices
template<typename T>
struct getVectorType<Lattice<T> >{
typedef typename Lattice<T>::vector_object type;
};
template<class sobj>
inline sobj eval(const unsigned int ss, const sobj &arg)
{
return arg;
}
template<class lobj>
inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg)
{
return arg._odata[ss];
template <class lobj>
inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
return arg._odata[ss];
}
// handle nodes in syntax tree
template <typename Op, typename T1>
auto inline eval(const unsigned int ss, const LatticeUnaryExpression<Op,T1 > &expr) // eval one operand
-> decltype(expr.first.func(eval(ss,std::get<0>(expr.second))))
{
return expr.first.func(eval(ss,std::get<0>(expr.second)));
auto inline eval(
const unsigned int ss,
const LatticeUnaryExpression<Op, T1> &expr) // eval one operand
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)));
}
template <typename Op, typename T1, typename T2>
auto inline eval(const unsigned int ss, const LatticeBinaryExpression<Op,T1,T2> &expr) // eval two operands
-> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second))))
{
return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)));
auto inline eval(
const unsigned int ss,
const LatticeBinaryExpression<Op, T1, T2> &expr) // eval two operands
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)));
}
template <typename Op, typename T1, typename T2, typename T3>
auto inline eval(const unsigned int ss, const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) // eval three operands
-> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second))))
{
return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second)) );
auto inline eval(const unsigned int ss,
const LatticeTrinaryExpression<Op, T1, T2, T3>
&expr) // eval three operands
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)));
}
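
Taken together, the eval overloads above are what a lattice assignment consumes: an expression such as c = a + b is walked once per outer site, each node calling eval on its children. A paraphrase of the assignment loop (not a verbatim copy of Lattice's operator=), with placeholder names:

// Sketch: per-site, recursive evaluation of an expression tree `expr` into `lhs`.
PARALLEL_FOR_LOOP
for (int ss = 0; ss < lhs._grid->oSites(); ss++) {
  lhs._odata[ss] = eval(ss, expr);   // recurses through unary/binary/trinary nodes
}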
//////////////////////////////////////////////////////////////////////////
// Obtain the grid from an expression, ensuring conformable. This must follow a tree recursion
// Obtain the grid from an expression, ensuring conformable. This must follow a
// tree recursion
//////////////////////////////////////////////////////////////////////////
template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
inline void GridFromExpression(GridBase * &grid,const T1& lat) // Lattice leaf
{
if ( grid ) {
conformable(grid,lat._grid);
}
grid=lat._grid;
}
template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
inline void GridFromExpression(GridBase * &grid,const T1& notlat) // non-lattice leaf
template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
{
if (grid) {
conformable(grid, lat._grid);
}
grid = lat._grid;
}
template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid,
const T1 &notlat) // non-lattice leaf
{}
template <typename Op, typename T1>
inline void GridFromExpression(GridBase * &grid,const LatticeUnaryExpression<Op,T1 > &expr)
{
GridFromExpression(grid,std::get<0>(expr.second));// recurse
inline void GridFromExpression(GridBase *&grid,
const LatticeUnaryExpression<Op, T1> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
}
template <typename Op, typename T1, typename T2>
inline void GridFromExpression(GridBase * &grid,const LatticeBinaryExpression<Op,T1,T2> &expr)
{
GridFromExpression(grid,std::get<0>(expr.second));// recurse
GridFromExpression(grid,std::get<1>(expr.second));
inline void GridFromExpression(
GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, std::get<1>(expr.second));
}
template <typename Op, typename T1, typename T2, typename T3>
inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr)
{
GridFromExpression(grid,std::get<0>(expr.second));// recurse
GridFromExpression(grid,std::get<1>(expr.second));
GridFromExpression(grid,std::get<2>(expr.second));
inline void GridFromExpression(
GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, std::get<1>(expr.second));
GridFromExpression(grid, std::get<2>(expr.second));
}
//////////////////////////////////////////////////////////////////////////
// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
// Obtain the CB from an expression, ensuring conformable. This must follow a
// tree recursion
//////////////////////////////////////////////////////////////////////////
template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
inline void CBFromExpression(int &cb,const T1& lat) // Lattice leaf
template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{
if ( (cb==Odd) || (cb==Even) ) {
assert(cb==lat.checkerboard);
}
cb=lat.checkerboard;
if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.checkerboard);
}
cb = lat.checkerboard;
// std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
}
template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
inline void CBFromExpression(int &cb,const T1& notlat) // non-lattice leaf
template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{
// std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
}
template <typename Op, typename T1>
inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
{
CBFromExpression(cb,std::get<0>(expr.second));// recurse
inline void CBFromExpression(int &cb,
const LatticeUnaryExpression<Op, T1> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
// std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
}
template <typename Op, typename T1, typename T2>
inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr)
{
CBFromExpression(cb,std::get<0>(expr.second));// recurse
CBFromExpression(cb,std::get<1>(expr.second));
inline void CBFromExpression(int &cb,
const LatticeBinaryExpression<Op, T1, T2> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, std::get<1>(expr.second));
// std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
}
template <typename Op, typename T1, typename T2, typename T3>
inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr)
{
CBFromExpression(cb,std::get<0>(expr.second));// recurse
CBFromExpression(cb,std::get<1>(expr.second));
CBFromExpression(cb,std::get<2>(expr.second));
inline void CBFromExpression(
int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, std::get<1>(expr.second));
CBFromExpression(cb, std::get<2>(expr.second));
// std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
}
////////////////////////////////////////////
// Unary operators and funcs
////////////////////////////////////////////
#define GridUnopClass(name,ret)\
template <class arg> struct name\
{\
static auto inline func(const arg a)-> decltype(ret) { return ret; } \
};
#define GridUnopClass(name, ret) \
template <class arg> \
struct name { \
static auto inline func(const arg a) -> decltype(ret) { return ret; } \
};
GridUnopClass(UnarySub,-a);
GridUnopClass(UnaryNot,Not(a));
GridUnopClass(UnaryAdj,adj(a));
GridUnopClass(UnaryConj,conjugate(a));
GridUnopClass(UnaryTrace,trace(a));
GridUnopClass(UnaryTranspose,transpose(a));
GridUnopClass(UnaryTa,Ta(a));
GridUnopClass(UnaryProjectOnGroup,ProjectOnGroup(a));
GridUnopClass(UnaryReal,real(a));
GridUnopClass(UnaryImag,imag(a));
GridUnopClass(UnaryToReal,toReal(a));
GridUnopClass(UnaryToComplex,toComplex(a));
GridUnopClass(UnaryAbs,abs(a));
GridUnopClass(UnarySqrt,sqrt(a));
GridUnopClass(UnaryRsqrt,rsqrt(a));
GridUnopClass(UnarySin,sin(a));
GridUnopClass(UnaryCos,cos(a));
GridUnopClass(UnaryLog,log(a));
GridUnopClass(UnaryExp,exp(a));
GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryReal, real(a));
GridUnopClass(UnaryImag, imag(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a));
GridUnopClass(UnaryAcos, acos(a));
GridUnopClass(UnaryLog, log(a));
GridUnopClass(UnaryExp, exp(a));
////////////////////////////////////////////
// Binary operators
////////////////////////////////////////////
#define GridBinOpClass(name,combination)\
template <class left,class right>\
struct name\
{\
static auto inline func(const left &lhs,const right &rhs)-> decltype(combination) const \
{\
return combination;\
}\
}
GridBinOpClass(BinaryAdd,lhs+rhs);
GridBinOpClass(BinarySub,lhs-rhs);
GridBinOpClass(BinaryMul,lhs*rhs);
#define GridBinOpClass(name, combination) \
template <class left, class right> \
struct name { \
static auto inline func(const left &lhs, const right &rhs) \
-> decltype(combination) const { \
return combination; \
} \
}
GridBinOpClass(BinaryAdd, lhs + rhs);
GridBinOpClass(BinarySub, lhs - rhs);
GridBinOpClass(BinaryMul, lhs *rhs);
GridBinOpClass(BinaryAnd ,lhs&rhs);
GridBinOpClass(BinaryOr ,lhs|rhs);
GridBinOpClass(BinaryAndAnd,lhs&&rhs);
GridBinOpClass(BinaryOrOr ,lhs||rhs);
GridBinOpClass(BinaryAnd, lhs &rhs);
GridBinOpClass(BinaryOr, lhs | rhs);
GridBinOpClass(BinaryAndAnd, lhs &&rhs);
GridBinOpClass(BinaryOrOr, lhs || rhs);
////////////////////////////////////////////////////
// Trinary conditional op
////////////////////////////////////////////////////
#define GridTrinOpClass(name,combination)\
template <class predicate,class left, class right> \
struct name\
{\
static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \
{\
return combination;\
}\
}
#define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \
struct name { \
static auto inline func(const predicate &pred, const left &lhs, \
const right &rhs) -> decltype(combination) const { \
return combination; \
} \
}
GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
typename std::remove_reference<left>::type, \
typename std::remove_reference<right>::type> (pred,lhs,rhs)));
GridTrinOpClass(
TrinaryWhere,
(predicatedWhere<predicate, typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs,
rhs)));
////////////////////////////////////////////
// Operator syntactical glue
////////////////////////////////////////////
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name)\
template <typename T1,\
typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr> inline auto op(const T1 &arg) \
-> decltype(LatticeUnaryExpression<GRID_UNOP(name),const T1&>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg)))) \
{ return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg))); }
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) \
name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_BINOP_LEFT(op, name)\
template <typename T1,typename T2,\
typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr>\
inline auto op(const T1 &lhs,const T2&rhs) \
-> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
std::forward_as_tuple(lhs, rhs)))) \
{\
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
std::forward_as_tuple(lhs, rhs))); \
}
#define GRID_DEF_UNOP(op, name) \
template <typename T1, \
typename std::enable_if<is_lattice<T1>::value || \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &arg) \
->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg))); \
}
#define GRID_BINOP_RIGHT(op, name)\
template <typename T1,typename T2,\
typename std::enable_if<!is_lattice<T1>::value && !is_lattice_expr<T1>::value, T1>::type* = nullptr,\
typename std::enable_if< is_lattice<T2>::value || is_lattice_expr<T2>::value, T2>::type* = nullptr> \
inline auto op(const T1 &lhs,const T2&rhs) \
-> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
std::forward_as_tuple(lhs, rhs)))) \
{\
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
std::forward_as_tuple(lhs, rhs))); \
}
#define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value || \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
}
#define GRID_DEF_BINOP(op, name)\
GRID_BINOP_LEFT(op,name);\
GRID_BINOP_RIGHT(op,name);
#define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value && \
!is_lattice_expr<T1>::value, \
T1>::type * = nullptr, \
typename std::enable_if<is_lattice<T2>::value || \
is_lattice_expr<T2>::value, \
T2>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
}
#define GRID_DEF_BINOP(op, name) \
GRID_BINOP_LEFT(op, name); \
GRID_BINOP_RIGHT(op, name);
#define GRID_DEF_TRINOP(op, name)\
template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,const T2&lhs,const T3 &rhs) \
-> decltype(LatticeTrinaryExpression<GRID_TRINOP(name),const T1&,const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(),\
std::forward_as_tuple(pred,lhs,rhs)))) \
{\
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(), \
std::forward_as_tuple(pred,lhs, rhs))); \
}
#define GRID_DEF_TRINOP(op, name) \
template <typename T1, typename T2, typename T3> \
inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
->decltype( \
LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) { \
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs))); \
}
////////////////////////
//Operator definitions
// Operator definitions
////////////////////////
GRID_DEF_UNOP(operator -,UnarySub);
GRID_DEF_UNOP(Not,UnaryNot);
GRID_DEF_UNOP(operator !,UnaryNot);
GRID_DEF_UNOP(adj,UnaryAdj);
GRID_DEF_UNOP(conjugate,UnaryConj);
GRID_DEF_UNOP(trace,UnaryTrace);
GRID_DEF_UNOP(transpose,UnaryTranspose);
GRID_DEF_UNOP(Ta,UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup,UnaryProjectOnGroup);
GRID_DEF_UNOP(real,UnaryReal);
GRID_DEF_UNOP(imag,UnaryImag);
GRID_DEF_UNOP(toReal,UnaryToReal);
GRID_DEF_UNOP(toComplex,UnaryToComplex);
GRID_DEF_UNOP(abs ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt ,UnarySqrt);
GRID_DEF_UNOP(rsqrt,UnaryRsqrt);
GRID_DEF_UNOP(sin ,UnarySin);
GRID_DEF_UNOP(cos ,UnaryCos);
GRID_DEF_UNOP(log ,UnaryLog);
GRID_DEF_UNOP(exp ,UnaryExp);
GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(adj, UnaryAdj);
GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(real, UnaryReal);
GRID_DEF_UNOP(imag, UnaryImag);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin);
GRID_DEF_UNOP(acos, UnaryAcos);
GRID_DEF_UNOP(log, UnaryLog);
GRID_DEF_UNOP(exp, UnaryExp);
GRID_DEF_BINOP(operator+,BinaryAdd);
GRID_DEF_BINOP(operator-,BinarySub);
GRID_DEF_BINOP(operator*,BinaryMul);
GRID_DEF_BINOP(operator+, BinaryAdd);
GRID_DEF_BINOP(operator-, BinarySub);
GRID_DEF_BINOP(operator*, BinaryMul);
GRID_DEF_BINOP(operator&,BinaryAnd);
GRID_DEF_BINOP(operator|,BinaryOr);
GRID_DEF_BINOP(operator&&,BinaryAndAnd);
GRID_DEF_BINOP(operator||,BinaryOrOr);
GRID_DEF_BINOP(operator&, BinaryAnd);
GRID_DEF_BINOP(operator|, BinaryOr);
GRID_DEF_BINOP(operator&&, BinaryAndAnd);
GRID_DEF_BINOP(operator||, BinaryOrOr);
GRID_DEF_TRINOP(where,TrinaryWhere);
GRID_DEF_TRINOP(where, TrinaryWhere);
/////////////////////////////////////////////////////////////
// Closure convenience to force expression to evaluate
/////////////////////////////////////////////////////////////
template<class Op,class T1>
auto closure(const LatticeUnaryExpression<Op,T1> & expr)
-> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))>
{
Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr);
template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
expr);
return ret;
}
template<class Op,class T1, class T2>
auto closure(const LatticeBinaryExpression<Op,T1,T2> & expr)
-> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second))))>
{
Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second))))> ret(expr);
template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second))))>
ret(expr);
return ret;
}
template<class Op,class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))))>
{
Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))))> ret(expr);
template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second)),
eval(0, std::get<2>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second)),
eval(0, std::get<2>(expr.second))))>
ret(expr);
return ret;
}
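// ---------------------------------------------------------------------------
// Usage sketch (editor's illustration, not part of the header). With the
// operators and closure() defined above, writing
//
//   LatticeComplex a(grid), b(grid);        // grid and mask assumed set up elsewhere
//   auto expr = a + adj(b);                 // builds a LatticeBinaryExpression node
//   LatticeComplex c = closure(expr);       // forces site-by-site evaluation
//   LatticeComplex d = closure(where(mask, a, b));  // trinary conditional
//
// defers evaluation until closure() (or assignment to a Lattice) walks the
// expression tree, so no intermediate Lattice temporaries are created.
// ---------------------------------------------------------------------------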
@ -382,12 +431,11 @@ template<class Op,class T1, class T2, class T3>
#undef GRID_DEF_UNOP
#undef GRID_DEF_BINOP
#undef GRID_DEF_TRINOP
}
#if 0
using namespace Grid;
int main(int argc,char **argv){
Lattice<double> v1(16);
@ -397,7 +445,7 @@ using namespace Grid;
BinaryAdd<double,double> tmp;
LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &>
expr(std::make_pair(tmp,
std::forward_as_tuple(v1,v2)));
std::forward_as_tuple(v1,v2)));
tmp.func(eval(0,v1),eval(0,v2));
auto var = v1+v2;

@ -1,32 +1,33 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_base.h
Source file: ./lib/lattice/Lattice_base.h
Copyright (C) 2015
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_BASE_H
#define GRID_LATTICE_BASE_H
@ -101,6 +102,7 @@ public:
int begin(void) { return 0;};
int end(void) { return _odata.size(); }
vobj & operator[](int i) { return _odata[i]; };
const vobj & operator[](int i) const { return _odata[i]; };
public:
typedef typename vobj::scalar_type scalar_type;
@ -255,6 +257,18 @@ PARALLEL_FOR_LOOP
checkerboard=0;
}
Lattice(const Lattice& r){ // copy constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
virtual ~Lattice(void) = default;
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
@ -267,7 +281,7 @@ PARALLEL_FOR_LOOP
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
@ -324,27 +338,27 @@ PARALLEL_FOR_LOOP
#include <lattice/Lattice_conformable.h>
#include "Lattice_conformable.h"
#define GRID_LATTICE_EXPRESSION_TEMPLATES
#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES
#include <lattice/Lattice_ET.h>
#include "Lattice_ET.h"
#else
#include <lattice/Lattice_overload.h>
#include "Lattice_overload.h"
#endif
#include <lattice/Lattice_arith.h>
#include <lattice/Lattice_trace.h>
#include <lattice/Lattice_transpose.h>
#include <lattice/Lattice_local.h>
#include <lattice/Lattice_reduction.h>
#include <lattice/Lattice_peekpoke.h>
#include <lattice/Lattice_reality.h>
#include <lattice/Lattice_comparison_utils.h>
#include <lattice/Lattice_comparison.h>
#include <lattice/Lattice_coordinate.h>
#include <lattice/Lattice_where.h>
#include <lattice/Lattice_rng.h>
#include <lattice/Lattice_unary.h>
#include <lattice/Lattice_transfer.h>
#include "Lattice_arith.h"
#include "Lattice_trace.h"
#include "Lattice_transpose.h"
#include "Lattice_local.h"
#include "Lattice_reduction.h"
#include "Lattice_peekpoke.h"
#include "Lattice_reality.h"
#include "Lattice_comparison_utils.h"
#include "Lattice_comparison.h"
#include "Lattice_coordinate.h"
#include "Lattice_where.h"
#include "Lattice_rng.h"
#include "Lattice_unary.h"
#include "Lattice_transfer.h"
#endif

@ -40,7 +40,7 @@ namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
return real(nrm);
return std::real(nrm);
}
template<class vobj>

@ -31,6 +31,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <random>
// RNG_SPRNG_SHA256 is not enabled by default yet.
// Uncomment the following line to see the effect of the new RNG.
// #define RNG_SPRNG_SHA256
#ifdef RNG_SPRNG_SHA256
#include "rng/sprng-sha256.h"
#endif
namespace Grid {
@ -110,7 +118,11 @@ namespace Grid {
int _seeded;
// One generator per site.
// Uniform and Gaussian distributions from these generators.
#ifdef RNG_RANLUX
#ifdef RNG_SPRNG_SHA256
typedef uint32_t RngStateType;
typedef SprngSha256 RngEngine;
static const int RngStateCount = 22;
#elif defined RNG_RANLUX
typedef uint64_t RngStateType;
typedef std::ranlux48 RngEngine;
static const int RngStateCount = 15;
@ -273,6 +285,34 @@ namespace Grid {
}
#ifdef RNG_SPRNG_SHA256
template<class source> void Seed(source &src)
{
std::vector<int> gcoor;
long gsites = _grid->_gsites;
RngState rs;
for (int i = 0; i < 8; ++i) {
splitRngState(rs, rs, src());
}
for(long gidx=0;gidx<gsites;gidx++){
int rank,o_idx,i_idx;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
int l_idx=generator_idx(o_idx,i_idx);
if( rank == _grid->ThisRank() ){
splitRngState(_generators[l_idx].rs, rs, gidx);
}
}
_seeded=1;
}
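// (Note: with RNG_SPRNG_SHA256 every site generator is obtained by splitting
//  the common root state with the site's *global* index, so the assignment of
//  RNG streams to sites is independent of the processor decomposition.)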
#else
// This loop could be made faster, reducing the Amdahl (serial) bottleneck, by
// i) seeding generators on each timeslice, for x=y=z=0;
// ii) seeding generators on each z for x=y=0
@ -312,6 +352,7 @@ namespace Grid {
}
_seeded=1;
}
#endif
//FIXME implement generic IO and create state save/restore
//void SaveState(const std::string<char> &file);

@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->_ldimensions[d] == og->_ldimensions[d]);
}
PARALLEL_FOR_LOOP
//PARALLEL_FOR_LOOP
for(int idx=0;idx<ig->lSites();idx++){
std::vector<int> lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor);
@ -386,7 +386,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
}
// the above should guarantee that the operations are local
PARALLEL_FOR_LOOP
//PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
@ -420,15 +420,15 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
assert(hg->_processors[orthog]==1);
int dl; dl = 0;
for(int d=0;d<nh;d++){
if ( d != orthog) {
assert(lg->_processors[dl] == hg->_processors[d]);
assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++;
for(int d=0;d<nh;d++){
if ( d != orthog) {
assert(lg->_processors[dl] == hg->_processors[d]);
assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++;
}
}
// the above should guarantee that the operations are local
PARALLEL_FOR_LOOP
//PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
@ -446,6 +446,79 @@ PARALLEL_FOR_LOOP
}
template<class vobj>
void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
typedef typename vobj::scalar_object sobj;
sobj s;
GridBase *lg = lowDim._grid;
GridBase *hg = higherDim._grid;
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl == nh);
assert(orthog<nh);
assert(orthog>=0);
for(int d=0;d<nh;d++){
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
// the above should guarantee that the operations are local
//PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDim,lcoor);
pokeLocalSite(s,higherDim,hcoor);
}
}
}
template<class vobj>
void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
typedef typename vobj::scalar_object sobj;
sobj s;
GridBase *lg = lowDim._grid;
GridBase *hg = higherDim._grid;
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl == nh);
assert(orthog<nh);
assert(orthog>=0);
for(int d=0;d<nh;d++){
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
// the above should guarantee that the operations are local
//PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDim,hcoor);
pokeLocalSite(s,lowDim,lcoor);
}
}
}
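// Usage sketch (editor's illustration): with two lattices A and B of identical
// local geometry, copy the t=3 slice of A into the t=7 slice of B and back:
//   InsertSliceLocal (A, B, 3, 7, Tdir);   // A[t=3] -> B[t=7]
//   ExtractSliceLocal(A, B, 3, 7, Tdir);   // B[t=7] -> A[t=3]
// (Tdir is the orthogonal direction index, assumed to be 3 on a 4d grid.)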
template<class vobj>
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
{
@ -482,6 +555,96 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
}
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
typedef typename vobj::vector_type vtype;
GridBase* in_grid = in._grid;
out.resize(in_grid->lSites());
int ndim = in_grid->Nd();
int in_nsimd = vtype::Nsimd();
std::vector<std::vector<int> > in_icoor(in_nsimd);
for(int lane=0; lane < in_nsimd; lane++){
in_icoor[lane].resize(ndim);
in_grid->iCoorFromIindex(in_icoor[lane], lane);
}
PARALLEL_FOR_LOOP
for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd);
std::vector<int> in_ocoor(ndim);
in_grid->oCoorFromOindex(in_ocoor, in_oidx);
std::vector<int> lcoor(in_grid->Nd());
for(int lane=0; lane < in_nsimd; lane++){
for(int mu=0;mu<ndim;mu++)
lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
int lex;
Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
out_ptrs[lane] = &out[lex];
}
//Unpack into those ptrs
const vobj & in_vobj = in._odata[in_oidx];
extract1(in_vobj, out_ptrs, 0);
}
}
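// Usage sketch (editor's illustration): flatten a SIMD-vectorised field into
// scalar site objects, ordered lexicographically by local coordinate:
//   std::vector<SiteObj> buf;               // SiteObj = vobj::scalar_object
//   unvectorizeToLexOrdArray(buf, field);   // buf.size() == grid->lSites()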
//Convert a Lattice from one precision to another
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
assert(out._grid->Nd() == in._grid->Nd());
out.checkerboard = in.checkerboard;
GridBase *in_grid=in._grid;
GridBase *out_grid = out._grid;
typedef typename VobjOut::scalar_object SobjOut;
typedef typename VobjIn::scalar_object SobjIn;
int ndim = out._grid->Nd();
int out_nsimd = out_grid->Nsimd();
std::vector<std::vector<int> > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){
out_icoor[lane].resize(ndim);
out_grid->iCoorFromIindex(out_icoor[lane], lane);
}
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
PARALLEL_FOR_LOOP
for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
std::vector<int> out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
std::vector<SobjOut*> ptrs(out_nsimd);
std::vector<int> lcoor(out_grid->Nd());
for(int lane=0; lane < out_nsimd; lane++){
for(int mu=0;mu<ndim;mu++)
lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
ptrs[lane] = &in_slex_conv[llex];
}
merge(out._odata[out_oidx], ptrs, 0);
}
}
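// Usage sketch (editor's illustration; the field typedefs and the two grids
// are assumed to be set up elsewhere):
//   LatticeFermionD src_d(UGrid_d);
//   LatticeFermionF dst_f(UGrid_f);
//   precisionChange(dst_f, src_d);   // double -> single on the same volume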
}
#endif

lib/lattice/rng/rng-state.h (new file)
@ -0,0 +1,353 @@
// vim: set ts=2 sw=2 expandtab:
// Copyright (c) 2016 Luchang Jin
// All rights reserved.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#pragma once
#ifndef INCLUDE_RNG_STATE_H
#define INCLUDE_RNG_STATE_H
#include "show.h"
#ifndef USE_OPENSSL
#include "sha256.h"
#else
#include <openssl/sha.h>
#endif
#include <stdint.h>
#include <endian.h>
#include <cstring>
#include <cmath>
#include <cassert>
#include <string>
#include <ostream>
#include <istream>
#include <vector>
#ifdef CURRENT_DEFAULT_NAMESPACE_NAME
namespace CURRENT_DEFAULT_NAMESPACE_NAME {
#endif
struct RngState;
inline void reset(RngState& rs);
inline void reset(RngState& rs, const std::string& seed);
inline void reset(RngState& rs, const long seed)
{
reset(rs, show(seed));
}
inline void splitRngState(RngState& rs, const RngState& rs0, const std::string& sindex);
inline void splitRngState(RngState& rs, const RngState& rs0, const long sindex = 0)
{
splitRngState(rs, rs0, show(sindex));
}
inline uint64_t randGen(RngState& rs);
inline double uRandGen(RngState& rs, const double upper = 1.0, const double lower = 0.0);
inline double gRandGen(RngState& rs, const double sigma = 1.0, const double center = 0.0);
inline void computeHashWithInput(uint32_t hash[8], const RngState& rs, const std::string& input);
struct RngState
{
uint64_t numBytes;
uint32_t hash[8];
unsigned long index;
//
uint64_t cache[3];
double gaussian;
int cacheAvail;
bool gaussianAvail;
//
inline void init()
{
reset(*this);
}
//
RngState()
{
init();
}
RngState(const std::string& seed)
{
reset(*this, seed);
}
RngState(const long seed)
{
reset(*this, seed);
}
RngState(const RngState& rs0, const std::string& sindex)
{
splitRngState(*this, rs0, sindex);
}
RngState(const RngState& rs0, const long sindex)
{
splitRngState(*this, rs0, sindex);
}
//
RngState split(const std::string& sindex)
{
RngState rs(*this, sindex);
return rs;
}
RngState split(const long sindex)
{
RngState rs(*this, sindex);
return rs;
}
};
const size_t RNG_STATE_NUM_OF_INT32 = 2 + 8 + 2 + 3 * 2 + 2 + 1 + 1;
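// layout of the serialised state, in 32-bit words:
//   numBytes(2) + hash(8) + index(2) + cache(3*2) + gaussian(2)
//   + cacheAvail(1) + gaussianAvail(1) = 22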
inline uint64_t patchTwoUint32(const uint32_t a, const uint32_t b)
{
return (uint64_t)a << 32 | (uint64_t)b;
}
inline void splitTwoUint32(uint32_t& a, uint32_t& b, const uint64_t x)
{
b = (uint32_t)x;
a = (uint32_t)(x >> 32);
assert(x == patchTwoUint32(a, b));
}
inline void exportRngState(uint32_t* v, const RngState& rs)
{
assert(22 == RNG_STATE_NUM_OF_INT32);
splitTwoUint32(v[0], v[1], rs.numBytes);
for (int i = 0; i < 8; ++i) {
v[2 + i] = rs.hash[i];
}
splitTwoUint32(v[10], v[11], rs.index);
for (int i = 0; i < 3; ++i) {
splitTwoUint32(v[12 + i * 2], v[12 + i * 2 + 1], rs.cache[i]);
}
union {
double d;
uint64_t l;
} g;
g.d = rs.gaussian;
splitTwoUint32(v[18], v[19], g.l);
v[20] = rs.cacheAvail;
v[21] = rs.gaussianAvail;
}
inline void importRngState(RngState& rs, const uint32_t* v)
{
assert(22 == RNG_STATE_NUM_OF_INT32);
rs.numBytes = patchTwoUint32(v[0], v[1]);
for (int i = 0; i < 8; ++i) {
rs.hash[i] = v[2 + i];
}
rs.index = patchTwoUint32(v[10], v[11]);
for (int i = 0; i < 3; ++i) {
rs.cache[i] = patchTwoUint32(v[12 + i * 2], v[12 + i * 2 + 1]);
}
union {
double d;
uint64_t l;
} g;
g.l = patchTwoUint32(v[18], v[19]);
rs.gaussian = g.d;
rs.cacheAvail = v[20];
rs.gaussianAvail = v[21];
}
inline void exportRngState(std::vector<uint32_t>& v, const RngState& rs)
{
v.resize(RNG_STATE_NUM_OF_INT32);
exportRngState(v.data(), rs);
}
inline void importRngState(RngState& rs, const std::vector<uint32_t>& v)
{
assert(RNG_STATE_NUM_OF_INT32 == v.size());
importRngState(rs, v.data());
}
inline std::ostream& operator<<(std::ostream& os, const RngState& rs)
{
std::vector<uint32_t> v(RNG_STATE_NUM_OF_INT32);
exportRngState(v, rs);
for (size_t i = 0; i < v.size() - 1; ++i) {
os << v[i] << " ";
}
os << v.back();
return os;
}
inline std::istream& operator>>(std::istream& is, RngState& rs)
{
std::vector<uint32_t> v(RNG_STATE_NUM_OF_INT32);
for (size_t i = 0; i < v.size(); ++i) {
is >> v[i];
}
importRngState(rs, v);
return is;
}
inline std::string show(const RngState& rs)
{
return shows(rs);
}
inline bool operator==(const RngState& rs1, const RngState& rs2)
{
return 0 == memcmp(&rs1, &rs2, sizeof(RngState));
}
inline void reset(RngState& rs)
{
std::memset(&rs, 0, sizeof(RngState));
rs.numBytes = 0;
rs.hash[0] = 0;
rs.hash[1] = 0;
rs.hash[2] = 0;
rs.hash[3] = 0;
rs.hash[4] = 0;
rs.hash[5] = 0;
rs.hash[6] = 0;
rs.hash[7] = 0;
rs.index = 0;
rs.cache[0] = 0;
rs.cache[1] = 0;
rs.cache[2] = 0;
rs.gaussian = 0.0;
rs.cacheAvail = 0;
rs.gaussianAvail = false;
}
inline void reset(RngState& rs, const std::string& seed)
{
reset(rs);
splitRngState(rs, rs, seed);
}
inline void computeHashWithInput(uint32_t hash[8], const RngState& rs, const std::string& input)
{
std::string data(32, ' ');
for (int i = 0; i < 8; ++i) {
data[i*4 + 0] = (rs.hash[i] >> 24) & 0xFF;
data[i*4 + 1] = (rs.hash[i] >> 16) & 0xFF;
data[i*4 + 2] = (rs.hash[i] >> 8) & 0xFF;
data[i*4 + 3] = rs.hash[i] & 0xFF;
}
data += input;
#ifndef USE_OPENSSL
sha256::computeHash(hash, (const uint8_t*)data.c_str(), data.length());
#else
{
uint8_t rawHash[32];
SHA256((unsigned char*)data.c_str(), data.length(), rawHash);
for (int i = 0; i < 8; ++i) {
hash[i] = (((uint32_t)rawHash[i*4 + 0]) << 24)
+ (((uint32_t)rawHash[i*4 + 1]) << 16)
+ (((uint32_t)rawHash[i*4 + 2]) << 8)
+ ( (uint32_t)rawHash[i*4 + 3]);
}
}
#endif
}
inline void splitRngState(RngState& rs, const RngState& rs0, const std::string& sindex)
// produce a new rng ``rs'' uniquely identified by ``rs0'' and ``sindex''
// will not affect old rng ``rs0''
// the function should behave correctly even if ``rs'' is actually ``rs0''
{
std::string input = ssprintf("[%lu] {%s}", rs0.index, sindex.c_str());
rs.numBytes = rs0.numBytes + 64 * ((32 + input.length() + 1 + 8 - 1) / 64 + 1);
computeHashWithInput(rs.hash, rs0, input);
rs.index = 0;
rs.cache[0] = 0;
rs.cache[1] = 0;
rs.cache[2] = 0;
rs.gaussian = 0.0;
rs.cacheAvail = 0;
rs.gaussianAvail = false;
}
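// Example (editor's illustration): a child stream is a pure function of the
// parent state and the split index, so
//   RngState root("run-42");
//   RngState a(root, 0);        // equivalently root.split(0)
//   RngState b(root, 1);
// yields reproducible, independent streams irrespective of evaluation order.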
inline uint64_t randGen(RngState& rs)
{
assert(0 <= rs.cacheAvail && rs.cacheAvail <= 3);
rs.index += 1;
if (rs.cacheAvail > 0) {
rs.cacheAvail -= 1;
uint64_t r = rs.cache[rs.cacheAvail];
rs.cache[rs.cacheAvail] = 0;
return r;
} else {
uint32_t hash[8];
computeHashWithInput(hash, rs, ssprintf("[%lu]", rs.index));
rs.cache[0] = patchTwoUint32(hash[0], hash[1]);
rs.cache[1] = patchTwoUint32(hash[2], hash[3]);
rs.cache[2] = patchTwoUint32(hash[4], hash[5]);
rs.cacheAvail = 3;
return patchTwoUint32(hash[6], hash[7]);
}
}
inline double uRandGen(RngState& rs, const double upper, const double lower)
{
uint64_t u = randGen(rs);
const double fac = 1.0 / (256.0 * 256.0 * 256.0 * 256.0) / (256.0 * 256.0 * 256.0 * 256.0);
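// fac = 2^-64, so u*fac lies in [0,1); the result is then rescaled to
// [lower, upper).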
return u * fac * (upper - lower) + lower;
}
inline double gRandGen(RngState& rs, const double sigma, const double center)
{
rs.index += 1;
if (rs.gaussianAvail) {
rs.gaussianAvail = false;
return rs.gaussian * sigma + center;
} else {
// pick 2 uniform numbers in the square extending from
// -1 to 1 in each direction, see if they are in the
// unit circle, and try again if they are not.
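// (Marsaglia polar method: for rsq = v1^2 + v2^2 in (0,1),
//  fac = sqrt(-2 ln(rsq)/rsq) makes v1*fac and v2*fac two independent
//  standard normals; one is returned now, the other cached in rs.gaussian.)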
int num_try = 1;
double v1, v2, rsq;
do {
v1 = uRandGen(rs, 1.0, -1.0);
v2 = uRandGen(rs, 1.0, -1.0);
if ((num_try % 1000)==0) {
printf("gRandGen : WARNING num_try=%d v1=%e v2=%e\n",num_try,v1,v2);
}
rsq = v1*v1 + v2*v2;
num_try++;
} while ((num_try < 10000) && (rsq >= 1.0 || rsq == 0));
if (num_try > 9999) {
printf("gRandGen : WARNING failed after 10000 tries (corrupted RNG?), returning ridiculous numbers (1e+10)\n");
return 1e+10;
}
double fac = std::sqrt(-2.0 * std::log(rsq)/rsq);
rs.gaussian = v1 * fac;
rs.gaussianAvail = true;
return v2 * fac * sigma + center;
}
}
#ifdef CURRENT_DEFAULT_NAMESPACE_NAME
}
#endif
#endif

lib/lattice/rng/sha256.h (new file)
@ -0,0 +1,348 @@
// vim: set ts=2 sw=2 expandtab:
// Copyright (c) 2016 Luchang Jin
// All rights reserved.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
// Code within namespace sha256 is originally from Stephan Brumme.
// see http://create.stephan-brumme.com/disclaimer.html
#pragma once
#include <stdint.h>
#include <endian.h>
#include <cstring>
#include <cmath>
#include <cassert>
#include <string>
#include <ostream>
#include <istream>
#include <vector>
#ifdef CURRENT_DEFAULT_NAMESPACE_NAME
namespace CURRENT_DEFAULT_NAMESPACE_NAME {
#endif
namespace sha256 {
const size_t BlockSize = 512 / 8;
const size_t HashBytes = 32;
const size_t HashValues = HashBytes / 4;
inline uint32_t rotate(uint32_t a, uint32_t c)
{
return (a >> c) | (a << (32 - c));
}
inline uint32_t swap(uint32_t x)
{
return (x >> 24) |
((x >> 8) & 0x0000FF00) |
((x << 8) & 0x00FF0000) |
(x << 24);
}
inline uint32_t f1(uint32_t e, uint32_t f, uint32_t g)
// mix functions for processBlock()
{
uint32_t term1 = rotate(e, 6) ^ rotate(e, 11) ^ rotate(e, 25);
uint32_t term2 = (e & f) ^ (~e & g); //(g ^ (e & (f ^ g)))
return term1 + term2;
}
inline uint32_t f2(uint32_t a, uint32_t b, uint32_t c)
// mix functions for processBlock()
{
uint32_t term1 = rotate(a, 2) ^ rotate(a, 13) ^ rotate(a, 22);
uint32_t term2 = ((a | b) & c) | (a & b); //(a & (b ^ c)) ^ (b & c);
return term1 + term2;
}
inline void processBlock(uint32_t newHash[8], const uint32_t oldHash[8], const uint8_t data[64])
// process 64 bytes of data
// newHash and oldHash can be the same
{
// get last hash
uint32_t a = oldHash[0];
uint32_t b = oldHash[1];
uint32_t c = oldHash[2];
uint32_t d = oldHash[3];
uint32_t e = oldHash[4];
uint32_t f = oldHash[5];
uint32_t g = oldHash[6];
uint32_t h = oldHash[7];
// data represented as 16x 32-bit words
const uint32_t* input = (uint32_t*) data;
// convert to big endian
uint32_t words[64];
int i;
for (i = 0; i < 16; i++) {
#if defined(__BYTE_ORDER) && (__BYTE_ORDER != 0) && (__BYTE_ORDER == __BIG_ENDIAN)
words[i] = input[i];
#else
words[i] = swap(input[i]);
#endif
}
uint32_t x,y; // temporaries
// first round
x = h + f1(e,f,g) + 0x428a2f98 + words[ 0]; y = f2(a,b,c); d += x; h = x + y;
x = g + f1(d,e,f) + 0x71374491 + words[ 1]; y = f2(h,a,b); c += x; g = x + y;
x = f + f1(c,d,e) + 0xb5c0fbcf + words[ 2]; y = f2(g,h,a); b += x; f = x + y;
x = e + f1(b,c,d) + 0xe9b5dba5 + words[ 3]; y = f2(f,g,h); a += x; e = x + y;
x = d + f1(a,b,c) + 0x3956c25b + words[ 4]; y = f2(e,f,g); h += x; d = x + y;
x = c + f1(h,a,b) + 0x59f111f1 + words[ 5]; y = f2(d,e,f); g += x; c = x + y;
x = b + f1(g,h,a) + 0x923f82a4 + words[ 6]; y = f2(c,d,e); f += x; b = x + y;
x = a + f1(f,g,h) + 0xab1c5ed5 + words[ 7]; y = f2(b,c,d); e += x; a = x + y;
// second round
x = h + f1(e,f,g) + 0xd807aa98 + words[ 8]; y = f2(a,b,c); d += x; h = x + y;
x = g + f1(d,e,f) + 0x12835b01 + words[ 9]; y = f2(h,a,b); c += x; g = x + y;
x = f + f1(c,d,e) + 0x243185be + words[10]; y = f2(g,h,a); b += x; f = x + y;
x = e + f1(b,c,d) + 0x550c7dc3 + words[11]; y = f2(f,g,h); a += x; e = x + y;
x = d + f1(a,b,c) + 0x72be5d74 + words[12]; y = f2(e,f,g); h += x; d = x + y;
x = c + f1(h,a,b) + 0x80deb1fe + words[13]; y = f2(d,e,f); g += x; c = x + y;
x = b + f1(g,h,a) + 0x9bdc06a7 + words[14]; y = f2(c,d,e); f += x; b = x + y;
x = a + f1(f,g,h) + 0xc19bf174 + words[15]; y = f2(b,c,d); e += x; a = x + y;
// extend to 24 words
for (; i < 24; i++)
words[i] = words[i-16] +
(rotate(words[i-15], 7) ^ rotate(words[i-15], 18) ^ (words[i-15] >> 3)) +
words[i-7] +
(rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10));
// third round
x = h + f1(e,f,g) + 0xe49b69c1 + words[16]; y = f2(a,b,c); d += x; h = x + y;
x = g + f1(d,e,f) + 0xefbe4786 + words[17]; y = f2(h,a,b); c += x; g = x + y;
x = f + f1(c,d,e) + 0x0fc19dc6 + words[18]; y = f2(g,h,a); b += x; f = x + y;
x = e + f1(b,c,d) + 0x240ca1cc + words[19]; y = f2(f,g,h); a += x; e = x + y;
x = d + f1(a,b,c) + 0x2de92c6f + words[20]; y = f2(e,f,g); h += x; d = x + y;
x = c + f1(h,a,b) + 0x4a7484aa + words[21]; y = f2(d,e,f); g += x; c = x + y;
x = b + f1(g,h,a) + 0x5cb0a9dc + words[22]; y = f2(c,d,e); f += x; b = x + y;
x = a + f1(f,g,h) + 0x76f988da + words[23]; y = f2(b,c,d); e += x; a = x + y;
// extend to 32 words
for (; i < 32; i++)
words[i] = words[i-16] +
(rotate(words[i-15], 7) ^ rotate(words[i-15], 18) ^ (words[i-15] >> 3)) +
words[i-7] +
(rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10));
// fourth round
x = h + f1(e,f,g) + 0x983e5152 + words[24]; y = f2(a,b,c); d += x; h = x + y;
x = g + f1(d,e,f) + 0xa831c66d + words[25]; y = f2(h,a,b); c += x; g = x + y;
x = f + f1(c,d,e) + 0xb00327c8 + words[26]; y = f2(g,h,a); b += x; f = x + y;
x = e + f1(b,c,d) + 0xbf597fc7 + words[27]; y = f2(f,g,h); a += x; e = x + y;
x = d + f1(a,b,c) + 0xc6e00bf3 + words[28]; y = f2(e,f,g); h += x; d = x + y;
x = c + f1(h,a,b) + 0xd5a79147 + words[29]; y = f2(d,e,f); g += x; c = x + y;
x = b + f1(g,h,a) + 0x06ca6351 + words[30]; y = f2(c,d,e); f += x; b = x + y;
x = a + f1(f,g,h) + 0x14292967 + words[31]; y = f2(b,c,d); e += x; a = x + y;
// extend to 40 words
for (; i < 40; i++)
words[i] = words[i-16] +
(rotate(words[i-15], 7) ^ rotate(words[i-15], 18) ^ (words[i-15] >> 3)) +
words[i-7] +
(rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10));
// fifth round
x = h + f1(e,f,g) + 0x27b70a85 + words[32]; y = f2(a,b,c); d += x; h = x + y;
x = g + f1(d,e,f) + 0x2e1b2138 + words[33]; y = f2(h,a,b); c += x; g = x + y;
x = f + f1(c,d,e) + 0x4d2c6dfc + words[34]; y = f2(g,h,a); b += x; f = x + y;
x = e + f1(b,c,d) + 0x53380d13 + words[35]; y = f2(f,g,h); a += x; e = x + y;
x = d + f1(a,b,c) + 0x650a7354 + words[36]; y = f2(e,f,g); h += x; d = x + y;
x = c + f1(h,a,b) + 0x766a0abb + words[37]; y = f2(d,e,f); g += x; c = x + y;
x = b + f1(g,h,a) + 0x81c2c92e + words[38]; y = f2(c,d,e); f += x; b = x + y;
x = a + f1(f,g,h) + 0x92722c85 + words[39]; y = f2(b,c,d); e += x; a = x + y;
// extend to 48 words
for (; i < 48; i++)
words[i] = words[i-16] +
(rotate(words[i-15], 7) ^ rotate(words[i-15], 18) ^ (words[i-15] >> 3)) +
words[i-7] +
(rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10));
// sixth round
x = h + f1(e,f,g) + 0xa2bfe8a1 + words[40]; y = f2(a,b,c); d += x; h = x + y;
x = g + f1(d,e,f) + 0xa81a664b + words[41]; y = f2(h,a,b); c += x; g = x + y;
x = f + f1(c,d,e) + 0xc24b8b70 + words[42]; y = f2(g,h,a); b += x; f = x + y;
x = e + f1(b,c,d) + 0xc76c51a3 + words[43]; y = f2(f,g,h); a += x; e = x + y;
x = d + f1(a,b,c) + 0xd192e819 + words[44]; y = f2(e,f,g); h += x; d = x + y;
x = c + f1(h,a,b) + 0xd6990624 + words[45]; y = f2(d,e,f); g += x; c = x + y;
x = b + f1(g,h,a) + 0xf40e3585 + words[46]; y = f2(c,d,e); f += x; b = x + y;
x = a + f1(f,g,h) + 0x106aa070 + words[47]; y = f2(b,c,d); e += x; a = x + y;
// extend to 56 words
for (; i < 56; i++)
words[i] = words[i-16] +
(rotate(words[i-15], 7) ^ rotate(words[i-15], 18) ^ (words[i-15] >> 3)) +
words[i-7] +
(rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10));
// seventh round
x = h + f1(e,f,g) + 0x19a4c116 + words[48]; y = f2(a,b,c); d += x; h = x + y;
x = g + f1(d,e,f) + 0x1e376c08 + words[49]; y = f2(h,a,b); c += x; g = x + y;
x = f + f1(c,d,e) + 0x2748774c + words[50]; y = f2(g,h,a); b += x; f = x + y;
x = e + f1(b,c,d) + 0x34b0bcb5 + words[51]; y = f2(f,g,h); a += x; e = x + y;
x = d + f1(a,b,c) + 0x391c0cb3 + words[52]; y = f2(e,f,g); h += x; d = x + y;
x = c + f1(h,a,b) + 0x4ed8aa4a + words[53]; y = f2(d,e,f); g += x; c = x + y;
x = b + f1(g,h,a) + 0x5b9cca4f + words[54]; y = f2(c,d,e); f += x; b = x + y;
x = a + f1(f,g,h) + 0x682e6ff3 + words[55]; y = f2(b,c,d); e += x; a = x + y;
// extend to 64 words
for (; i < 64; i++)
words[i] = words[i-16] +
(rotate(words[i-15], 7) ^ rotate(words[i-15], 18) ^ (words[i-15] >> 3)) +
words[i-7] +
(rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10));
// eighth round
x = h + f1(e,f,g) + 0x748f82ee + words[56]; y = f2(a,b,c); d += x; h = x + y;
x = g + f1(d,e,f) + 0x78a5636f + words[57]; y = f2(h,a,b); c += x; g = x + y;
x = f + f1(c,d,e) + 0x84c87814 + words[58]; y = f2(g,h,a); b += x; f = x + y;
x = e + f1(b,c,d) + 0x8cc70208 + words[59]; y = f2(f,g,h); a += x; e = x + y;
x = d + f1(a,b,c) + 0x90befffa + words[60]; y = f2(e,f,g); h += x; d = x + y;
x = c + f1(h,a,b) + 0xa4506ceb + words[61]; y = f2(d,e,f); g += x; c = x + y;
x = b + f1(g,h,a) + 0xbef9a3f7 + words[62]; y = f2(c,d,e); f += x; b = x + y;
x = a + f1(f,g,h) + 0xc67178f2 + words[63]; y = f2(b,c,d); e += x; a = x + y;
// update hash
newHash[0] = a + oldHash[0];
newHash[1] = b + oldHash[1];
newHash[2] = c + oldHash[2];
newHash[3] = d + oldHash[3];
newHash[4] = e + oldHash[4];
newHash[5] = f + oldHash[5];
newHash[6] = g + oldHash[6];
newHash[7] = h + oldHash[7];
}
inline void processInput(
uint32_t hash[8],
const uint32_t oldHash[8], const uint64_t numBytes,
const uint8_t* input, const size_t inputSize)
// process final block, less than 64 bytes
// newHash and oldHash can be the same
{
// the input bytes are considered as bit strings, where the first bit is the most significant bit of the byte
// - append "1" bit to message
// - append "0" bits until message length in bit mod 512 is 448
// - append length as 64 bit integer
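// worked example: a 3-byte tail is 24 bits; +1 marker bit = 25 bits, padded
// with zeros up to 448 bits (56 bytes); the 64-bit length field then fills
// bytes 56..63, so the final data fits in a single 64-byte block and the
// "extra" block below is not needed.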
// process initial parts of input
std::memmove(hash, oldHash, 32);
const int nBlocks = inputSize / 64;
for (int i = 0; i < nBlocks; ++i) {
processBlock(hash, hash, input + i * 64);
}
// initialize buffer from input
const size_t bufferSize = inputSize - nBlocks * 64;
unsigned char buffer[BlockSize];
std::memcpy(buffer, input + nBlocks * 64, bufferSize);
// number of bits
size_t paddedLength = bufferSize * 8;
// plus one bit set to 1 (always appended)
paddedLength++;
// number of bits must be (numBits % 512) = 448
size_t lower11Bits = paddedLength & 511;
if (lower11Bits <= 448) {
paddedLength += 448 - lower11Bits;
} else {
paddedLength += 512 + 448 - lower11Bits;
}
// convert from bits to bytes
paddedLength /= 8;
// only needed if additional data flows over into a second block
unsigned char extra[BlockSize];
// append a "1" bit, 128 => binary 10000000
if (bufferSize < BlockSize) {
buffer[bufferSize] = 128;
} else {
extra[0] = 128;
}
size_t i;
for (i = bufferSize + 1; i < BlockSize; i++) {
buffer[i] = 0;
}
for (; i < paddedLength; i++) {
extra[i - BlockSize] = 0;
}
// add message length in bits as 64 bit number
uint64_t msgBits = 8 * (numBytes + inputSize);
// find right position
unsigned char* addLength;
if (paddedLength < BlockSize) {
addLength = buffer + paddedLength;
} else {
addLength = extra + paddedLength - BlockSize;
}
// must be big endian
*addLength++ = (unsigned char)((msgBits >> 56) & 0xFF);
*addLength++ = (unsigned char)((msgBits >> 48) & 0xFF);
*addLength++ = (unsigned char)((msgBits >> 40) & 0xFF);
*addLength++ = (unsigned char)((msgBits >> 32) & 0xFF);
*addLength++ = (unsigned char)((msgBits >> 24) & 0xFF);
*addLength++ = (unsigned char)((msgBits >> 16) & 0xFF);
*addLength++ = (unsigned char)((msgBits >> 8) & 0xFF);
*addLength = (unsigned char)( msgBits & 0xFF);
// process blocks
processBlock(hash, hash, buffer);
// flowed over into a second block ?
if (paddedLength > BlockSize) {
processBlock(hash, hash, extra);
}
}
inline void setInitialHash(uint32_t hash[8])
{
hash[0] = 0x6a09e667;
hash[1] = 0xbb67ae85;
hash[2] = 0x3c6ef372;
hash[3] = 0xa54ff53a;
hash[4] = 0x510e527f;
hash[5] = 0x9b05688c;
hash[6] = 0x1f83d9ab;
hash[7] = 0x5be0cd19;
}
inline void computeHash(uint32_t hash[8], const void* data, const size_t size)
{
uint32_t initHash[8];
setInitialHash(initHash);
processInput(hash, initHash, 0, (const uint8_t*)data, size);
}
inline void rawHashFromHash(uint8_t rawHash[HashBytes], const uint32_t hash[HashValues])
{
uint8_t* current = rawHash;
for (size_t i = 0; i < HashValues; i++) {
*current++ = (hash[i] >> 24) & 0xFF;
*current++ = (hash[i] >> 16) & 0xFF;
*current++ = (hash[i] >> 8) & 0xFF;
*current++ = hash[i] & 0xFF;
}
}
inline std::string showRawHash(const uint8_t rawHash[HashBytes])
{
std::string result;
result.reserve(2 * HashBytes);
for (size_t i = 0; i < HashBytes; i++) {
static const char dec2hex[16+1] = "0123456789abcdef";
result += dec2hex[(rawHash[i] >> 4) & 15];
result += dec2hex[ rawHash[i] & 15];
}
return result;
}
inline std::string showHash(const uint32_t hash[8])
{
unsigned char rawHash[HashBytes];
rawHashFromHash(rawHash, hash);
return showRawHash(rawHash);
}
}
#ifdef CURRENT_DEFAULT_NAMESPACE_NAME
}
#endif

lib/lattice/rng/show.h (new file)
@ -0,0 +1,125 @@
// vim: set ts=2 sw=2 expandtab:
// Copyright (c) 2014 Luchang Jin
// All rights reserved.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#pragma once
#ifndef INCLUDE_SHOW_H
#define INCLUDE_SHOW_H
#include <sstream>
#include <string>
#include <cstdarg>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <sstream>
#ifdef CURRENT_DEFAULT_NAMESPACE_NAME
namespace CURRENT_DEFAULT_NAMESPACE_NAME {
#endif
inline std::string vssprintf(const char* fmt, va_list args)
{
std::string str;
char* cstr;
vasprintf(&cstr, fmt, args);
str += std::string(cstr);
std::free(cstr);
return str;
}
inline std::string ssprintf(const char* fmt, ...)
{
va_list args;
va_start(args, fmt);
return vssprintf(fmt, args);
}
inline std::string show()
{
return "";
}
inline std::string show(const int& x)
{
return ssprintf("%d", x);
}
inline std::string show(const unsigned int& x)
{
return ssprintf("%u", x);
}
inline std::string show(const long& x)
{
return ssprintf("%ld", x);
}
inline std::string show(const unsigned long& x)
{
return ssprintf("%lu", x);
}
inline std::string show(const double& x)
{
return ssprintf("%24.17E", x);
}
inline std::string show(const bool& x)
{
return x ? "true" : "false";
}
inline std::string show(const std::string& x)
{
std::ostringstream out;
out << x;
return out.str();
}
template <class T>
std::string shows(const T& x)
{
std::ostringstream out;
out << x;
return out.str();
}
template <class T>
T& reads(T& x, const std::string& str)
{
std::istringstream in(str);
in >> x;
return x;
}
inline void fdisplay(FILE* fp, const std::string& str)
{
fprintf(fp, "%s", str.c_str());
}
inline void fdisplayln(FILE* fp, const std::string& str)
{
fprintf(fp, "%s\n", str.c_str());
}
#ifdef CURRENT_DEFAULT_NAMESPACE_NAME
}
#endif
#endif

lib/lattice/rng/sprng-sha256.h (new file)
@ -0,0 +1,115 @@
// vim: set ts=2 sw=2 expandtab:
// Copyright (c) 2016 Luchang Jin
// All rights reserved.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#pragma once
#ifndef INCLUDE_SPRNG_SHA256_H
#define INCLUDE_SPRNG_SHA256_H
#include "rng-state.h"
#include <array>
#include <cstring>
#include <ostream>
#include <istream>
#ifdef CURRENT_DEFAULT_NAMESPACE_NAME
namespace CURRENT_DEFAULT_NAMESPACE_NAME {
#endif
struct SprngSha256
{
RngState rs;
//
using result_type = uint64_t;
//
static constexpr result_type default_seed = 0;
//
explicit SprngSha256(result_type val = default_seed)
{
seed(val);
}
template<typename Sseq, typename = typename
std::enable_if<!std::is_same<Sseq, SprngSha256>::value>
::type>
explicit SprngSha256(Sseq& q)
{
seed(q);
}
//
static constexpr result_type min()
{
return 0;
}
//
static constexpr result_type max()
{
return UINT64_MAX;
}
//
void seed(result_type val = default_seed)
{
reset(rs, (long)val);
}
template <class Sseq>
typename std::enable_if<std::is_class<Sseq>::value>::type
seed(Sseq& q)
{
std::array<uint32_t, 8> seq;
q.generate(seq.begin(), seq.end());
reset(rs);
for (size_t i = 0; i < seq.size(); ++i) {
splitRngState(rs, rs, seq[i]);
}
}
//
result_type operator()()
{
return randGen(rs);
}
//
void discard(unsigned long long z)
{
for (unsigned long long i = 0; i < z; ++i) {
randGen(rs);
}
}
};
inline std::ostream& operator<<(std::ostream& os, const SprngSha256& ss)
{
os << ss.rs;
return os;
}
inline std::istream& operator>>(std::istream& is, SprngSha256& ss)
{
is >> ss.rs;
return is;
}
inline bool operator==(const SprngSha256& ss1, const SprngSha256& ss2)
{
return ss1.rs == ss2.rs;
}
#ifdef CURRENT_DEFAULT_NAMESPACE_NAME
}
#endif
#endif

@ -194,22 +194,22 @@ class BinaryIO {
std::vector<int> site({x,y,z,t});
if ( grid->IsBoss() ) {
fin.read((char *)&file_object,sizeof(file_object));
bytes += sizeof(file_object);
if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object));
if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
if(ieee64) le64toh_v((void *)&file_object,sizeof(file_object));
if (grid->IsBoss()) {
fin.read((char *)&file_object, sizeof(file_object));
bytes += sizeof(file_object);
if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
munge(file_object,munged,csum);
munge(file_object, munged, csum);
}
// The boss who read the file has their value poked
pokeSite(munged,Umu,site);
}}}}
timer.Stop();
std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/ (double)timer.useconds() <<" MB/s " <<std::endl;
<< (double)bytes/ (double)timer.useconds() <<" MB/s " <<std::endl;
return csum;
}
@ -254,20 +254,20 @@ class BinaryIO {
if ( grid->IsBoss() ) {
if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
if(ieee32) htole32_v((void *)&file_object,sizeof(file_object));
if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));
if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
if(ieee32) htole32_v((void *)&file_object,sizeof(file_object));
if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));
// NB could gather an xstrip as an optimisation.
fout.write((char *)&file_object,sizeof(file_object));
bytes+=sizeof(file_object);
// NB could gather an xstrip as an optimisation.
fout.write((char *)&file_object,sizeof(file_object));
bytes+=sizeof(file_object);
}
}}}}
timer.Stop();
std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
return csum;
}
@ -305,15 +305,15 @@ class BinaryIO {
int l_idx=parallel.generator_idx(o_idx,i_idx);
if( rank == grid->ThisRank() ){
// std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
parallel.GetState(saved,l_idx);
// std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
parallel.GetState(saved,l_idx);
}
grid->Broadcast(rank,(void *)&saved[0],bytes);
if ( grid->IsBoss() ) {
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
fout.write((char *)&saved[0],bytes);
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
fout.write((char *)&saved[0],bytes);
}
}
@@ -355,14 +355,14 @@ class BinaryIO {
int l_idx=parallel.generator_idx(o_idx,i_idx);
if ( grid->IsBoss() ) {
fin.read((char *)&saved[0],bytes);
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
fin.read((char *)&saved[0],bytes);
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
}
grid->Broadcast(0,(void *)&saved[0],bytes);
if( rank == grid->ThisRank() ){
parallel.SetState(saved,l_idx);
parallel.SetState(saved,l_idx);
}
}
@@ -415,15 +415,15 @@ class BinaryIO {
if ( d == 0 ) parallel[d] = 0;
if (parallel[d]) {
range[d] = grid->_ldimensions[d];
start[d] = grid->_processor_coor[d]*range[d];
ioproc[d]= grid->_processor_coor[d];
range[d] = grid->_ldimensions[d];
start[d] = grid->_processor_coor[d]*range[d];
ioproc[d]= grid->_processor_coor[d];
} else {
range[d] = grid->_gdimensions[d];
start[d] = 0;
ioproc[d]= 0;
range[d] = grid->_gdimensions[d];
start[d] = 0;
ioproc[d]= 0;
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
}
slice_vol = slice_vol * range[d];
}
@@ -434,9 +434,9 @@ class BinaryIO {
std::cout<< std::dec ;
std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
for(int d=0;d<grid->_ndimension;d++){
std::cout<< range[d];
if( d< grid->_ndimension-1 )
std::cout<< " x ";
std::cout<< range[d];
if( d< grid->_ndimension-1 )
std::cout<< " x ";
}
std::cout << std::endl;
}
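To make the decomposition concrete: assume (purely for illustration) an 8^4 global lattice on a 2x2x2x2 processor grid with the x direction forced serial, as readObjectParallel does for d == 0. Parallel directions keep their local extent while serial directions keep the global extent and restrict I/O to the coordinate-zero plane, giving a subslice of 8 x 4 x 4 x 4 = 512 sites handled by 2 x 2 x 2 = 8 IO nodes. The same arithmetic as a stand-alone sketch:

#include <vector>
#include <iostream>

void example_io_decomposition() {
  std::vector<int> gdims = {8, 8, 8, 8}, procs = {2, 2, 2, 2};
  std::vector<int> parallel = {0, 1, 1, 1};        // d==0 serialised, as in the reader above
  int slice_vol = 1, ionodes = 1;
  for (int d = 0; d < 4; d++) {
    int range = parallel[d] ? gdims[d] / procs[d] : gdims[d];  // local vs global extent
    slice_vol *= range;
    ionodes   *= parallel[d] ? procs[d] : 1;       // serial dims keep only the coor==0 plane
  }
  std::cout << "subslice volume " << slice_vol << " on " << ionodes << " IO nodes" << std::endl;
  // prints: subslice volume 512 on 8 IO nodes
}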
@@ -463,7 +463,7 @@ class BinaryIO {
// need to implement these loops in Nd independent way with a lexico conversion
for(int tlex=0;tlex<slice_vol;tlex++){
std::vector<int> tsite(nd); // temporary mixed up site
std::vector<int> gsite(nd);
std::vector<int> lsite(nd);
@@ -472,8 +472,8 @@ class BinaryIO {
Lexicographic::CoorFromIndex(tsite,tlex,range);
for(int d=0;d<nd;d++){
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
gsite[d] = tsite[d]+start[d]; // global site
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
gsite[d] = tsite[d]+start[d]; // global site
}
/////////////////////////
@@ -487,29 +487,29 @@ class BinaryIO {
// iorank reads from the seek
////////////////////////////////
if (myrank == iorank) {
fin.seekg(offset+g_idx*sizeof(fileObj));
fin.read((char *)&fileObj,sizeof(fileObj));
bytes+=sizeof(fileObj);
if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj));
munge(fileObj,siteObj,csum);
fin.seekg(offset+g_idx*sizeof(fileObj));
fin.read((char *)&fileObj,sizeof(fileObj));
bytes+=sizeof(fileObj);
if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj));
munge(fileObj,siteObj,csum);
}
}
// Possibly do transport through pt2pt
if ( rank != iorank ) {
if ( (myrank == rank) || (myrank==iorank) ) {
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
}
if ( (myrank == rank) || (myrank==iorank) ) {
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
}
}
// Poke at destination
if ( myrank == rank ) {
pokeLocalSite(siteObj,Umu,lsite);
pokeLocalSite(siteObj,Umu,lsite);
}
grid->Barrier(); // necessary?
}
@@ -520,7 +520,7 @@ class BinaryIO {
timer.Stop();
std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
return csum;
}
@@ -558,15 +558,15 @@ class BinaryIO {
if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
if (parallel[d]) {
range[d] = grid->_ldimensions[d];
start[d] = grid->_processor_coor[d]*range[d];
ioproc[d]= grid->_processor_coor[d];
range[d] = grid->_ldimensions[d];
start[d] = grid->_processor_coor[d]*range[d];
ioproc[d]= grid->_processor_coor[d];
} else {
range[d] = grid->_gdimensions[d];
start[d] = 0;
ioproc[d]= 0;
range[d] = grid->_gdimensions[d];
start[d] = 0;
ioproc[d]= 0;
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
}
slice_vol = slice_vol * range[d];
@@ -577,9 +577,9 @@ class BinaryIO {
grid->GlobalSum(tmp);
std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
for(int d=0;d<grid->_ndimension;d++){
std::cout<< range[d];
if( d< grid->_ndimension-1 )
std::cout<< " x ";
std::cout<< range[d];
if( d< grid->_ndimension-1 )
std::cout<< " x ";
}
std::cout << std::endl;
}
@@ -610,7 +610,7 @@ class BinaryIO {
// should aggregate a whole chunk and then write.
// need to implement these loops in Nd independent way with a lexico conversion
for(int tlex=0;tlex<slice_vol;tlex++){
std::vector<int> tsite(nd); // temporary mixed up site
std::vector<int> gsite(nd);
std::vector<int> lsite(nd);
@@ -619,8 +619,8 @@ class BinaryIO {
Lexicographic::CoorFromIndex(tsite,tlex,range);
for(int d=0;d<nd;d++){
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
gsite[d] = tsite[d]+start[d]; // global site
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
gsite[d] = tsite[d]+start[d]; // global site
}
@@ -640,26 +640,26 @@ class BinaryIO {
// Pair of nodes may need to do pt2pt send
if ( rank != iorank ) { // comms is necessary
if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
// Send to IOrank
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
}
if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
// Send to IOrank
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
}
}
grid->Barrier(); // necessary?
if (myrank == iorank) {
munge(siteObj,fileObj,csum);
munge(siteObj,fileObj,csum);
if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
if(ieee32) htole32_v((void *)&fileObj,sizeof(fileObj));
if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
if(ieee64) htole64_v((void *)&fileObj,sizeof(fileObj));
fout.seekp(offset+g_idx*sizeof(fileObj));
fout.write((char *)&fileObj,sizeof(fileObj));
bytes+=sizeof(fileObj);
if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
if(ieee32) htole32_v((void *)&fileObj,sizeof(fileObj));
if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
if(ieee64) htole64_v((void *)&fileObj,sizeof(fileObj));
fout.seekp(offset+g_idx*sizeof(fileObj));
fout.write((char *)&fileObj,sizeof(fileObj));
bytes+=sizeof(fileObj);
}
}
@@ -668,7 +668,7 @@ class BinaryIO {
timer.Stop();
std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
return csum;
}
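Both the serial and parallel paths above repeat the same per-object pattern: read (or stage) the raw bytes, convert between the file's endianness and host order, fold the 32-bit words into a running checksum, and munge to or from the site object. A stand-alone sketch of the convert-and-checksum step in plain C++; the helper names are illustrative only (not Grid's be32toh_v or Uint32Checksum), and a little-endian host is assumed:

#include <cstdint>
#include <cstddef>

// Illustrative byte swap of one 32-bit word.
static inline uint32_t bswap32(uint32_t x) {
  return (x >> 24) | ((x >> 8) & 0x0000FF00u) | ((x << 8) & 0x00FF0000u) | (x << 24);
}

// Convert a buffer of 32-bit words from big-endian file order to host order
// (assuming a little-endian host) and accumulate a simple additive checksum.
void be32_to_host_and_checksum(void* buf, size_t bytes, uint32_t& csum) {
  uint32_t* w = static_cast<uint32_t*>(buf);
  for (size_t i = 0; i < bytes / sizeof(uint32_t); ++i) {
    w[i] = bswap32(w[i]);   // skip the swap on a big-endian host
    csum += w[i];           // same spirit as the checksum accumulation above
  }
}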


@@ -17,7 +17,7 @@
#endif
// Include user configuration file (this can define various configuration macros)
#include <pugixml/pugiconfig.hpp>
#include "pugiconfig.hpp"
#ifndef HEADER_PUGIXML_HPP
#define HEADER_PUGIXML_HPP


@@ -55,10 +55,19 @@ namespace QCD {
//////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour
// note: static const int or constexpr will not work for type deduction
// with the intel compiler (up to version 17)
//////////////////////////////////////////////////////////////////////////////
static const int ColourIndex = 2;
static const int SpinIndex = 1;
static const int LorentzIndex= 0;
#define ColourIndex 2
#define SpinIndex 1
#define LorentzIndex 0
// Also should make these a named enum type
static const int DaggerNo=0;
static const int DaggerYes=1;
static const int InverseNo=0;
static const int InverseYes=1;
// Useful trait: is this a spin index?
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
@@ -484,16 +493,27 @@ namespace QCD {
} //namespace QCD
} // Grid
#include <qcd/utils/SpaceTimeGrid.h>
#include <qcd/spin/Dirac.h>
#include <qcd/spin/TwoSpinor.h>
#include <qcd/utils/LinalgUtils.h>
#include <qcd/utils/CovariantCshift.h>
#include <qcd/utils/SUn.h>
#include <qcd/action/Actions.h>
#include <qcd/hmc/integrators/Integrator.h>
#include <qcd/hmc/integrators/Integrator_algorithm.h>
#include <qcd/hmc/HMC.h>
#include <Grid/qcd/utils/SpaceTimeGrid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/spin/TwoSpinor.h>
#include <Grid/qcd/utils/LinalgUtils.h>
#include <Grid/qcd/utils/CovariantCshift.h>
// Include representations
#include <Grid/qcd/utils/SUn.h>
#include <Grid/qcd/utils/SUnAdjoint.h>
#include <Grid/qcd/utils/SUnTwoIndex.h>
#include <Grid/qcd/representations/hmc_types.h>
#include <Grid/qcd/action/Actions.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/qcd/hmc/integrators/Integrator.h>
#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
#include <Grid/qcd/hmc/HMC.h>
#endif


@@ -1,86 +1,153 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/ActionBase.h
Source file: ./lib/qcd/action/ActionBase.h
Copyright (C) 2015
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_ACTION_BASE
#define QCD_ACTION_BASE
namespace Grid {
namespace QCD{
template<class GaugeField>
class Action {
namespace QCD {
template <class GaugeField>
class Action {
public:
bool is_smeared = false;
// Boundary conditions? // Heatbath?
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) = 0;// refresh pseudofermions
virtual RealD S (const GaugeField &U) = 0; // evaluate the action
virtual void deriv(const GaugeField &U,GaugeField & dSdU ) = 0; // evaluate the action derivative
virtual ~Action() {};
virtual void refresh(const GaugeField& U,
GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual void deriv(const GaugeField& U,
GaugeField& dSdU) = 0; // evaluate the action derivative
virtual ~Action(){};
};
// Indexing of tuple types
template <class T, class Tuple>
struct Index;
template <class T, class... Types>
struct Index<T, std::tuple<T, Types...>> {
static const std::size_t value = 0;
};
template <class T, class U, class... Types>
struct Index<T, std::tuple<U, Types...>> {
static const std::size_t value = 1 + Index<T, std::tuple<Types...>>::value;
};
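The Index metafunction above resolves, at compile time, the position of a type inside a std::tuple; ActionLevel uses it further down to route each Action<Field>* into the vector for the matching representation field. A minimal sanity check of the recursion, assuming only that the Index template above is in scope:

#include <tuple>

using ExampleTuple = std::tuple<double, int, char>;
static_assert(Index<double, ExampleTuple>::value == 0, "base case: head of the tuple matches");
static_assert(Index<int,    ExampleTuple>::value == 1, "one recursion step");
static_assert(Index<char,   ExampleTuple>::value == 2, "two recursion steps");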
// Could derive PseudoFermion action with a PF field, FermionField, and a Grid; implement refresh
/*
template<class GaugeField, class FermionField>
class PseudoFermionAction : public Action<GaugeField> {
template <class GaugeField>
struct ActionLevel {
public:
FermionField Phi;
GridParallelRNG &pRNG;
GridBase &Grid;
typedef Action<GaugeField>*
ActPtr; // now force the same colours as the rest of the code
PseudoFermionAction(GridBase &_Grid,GridParallelRNG &_pRNG) : Grid(_Grid), Phi(&_Grid), pRNG(_pRNG) {
};
//Add supported representations here
virtual void refresh(const GaugeField &gauge) {
gaussian(Phi,pRNG);
};
};
*/
template<class GaugeField> struct ActionLevel{
public:
typedef Action<GaugeField>* ActPtr; // now force the same colours as the rest of the code
int multiplier;
unsigned int multiplier;
std::vector<ActPtr> actions;
ActionLevel(int mul = 1) : multiplier(mul) {
assert (mul > 0);
ActionLevel(unsigned int mul = 1) : actions(0), multiplier(mul) {
assert(mul >= 1);
};
void push_back(ActPtr ptr){
actions.push_back(ptr);
void push_back(ActPtr ptr) { actions.push_back(ptr); }
};
*/
template <class GaugeField, class Repr = NoHirep >
struct ActionLevel {
public:
unsigned int multiplier;
// Fundamental repr actions separated because of the smearing
typedef Action<GaugeField>* ActPtr;
// construct a tuple of vectors of the actions for the corresponding higher
// representation fields
typedef typename AccessTypes<Action, Repr>::VectorCollection action_collection;
action_collection actions_hirep;
typedef typename AccessTypes<Action, Repr>::FieldTypeCollection action_hirep_types;
std::vector<ActPtr>& actions;
// Temporary conversion between ActionLevel and ActionLevelHirep
//ActionLevelHirep(ActionLevel<GaugeField>& AL ):actions(AL.actions), multiplier(AL.multiplier){}
ActionLevel(unsigned int mul = 1) : actions(std::get<0>(actions_hirep)), multiplier(mul) {
// initialize the hirep vectors to zero.
//apply(this->resize, actions_hirep, 0); //need a working resize
assert(mul >= 1);
};
//void push_back(ActPtr ptr) { actions.push_back(ptr); }
template < class Field >
void push_back(Action<Field>* ptr) {
// insert only in the correct vector
std::get< Index < Field, action_hirep_types>::value >(actions_hirep).push_back(ptr);
};
template < class ActPtr>
static void resize(ActPtr ap, unsigned int n){
ap->resize(n);
}
//template <std::size_t I>
//auto getRepresentation(Repr& R)->decltype(std::get<I>(R).U) {return std::get<I>(R).U;}
// Loop on tuple for a callable function
template <std::size_t I = 1, typename Callable, typename ...Args>
inline typename std::enable_if<I == std::tuple_size<action_collection>::value, void>::type apply(
Callable, Repr& R,Args&...) const {}
template <std::size_t I = 1, typename Callable, typename ...Args>
inline typename std::enable_if<I < std::tuple_size<action_collection>::value, void>::type apply(
Callable fn, Repr& R, Args&... arguments) const {
fn(std::get<I>(actions_hirep), std::get<I>(R.rep), arguments...);
apply<I + 1>(fn, R, arguments...);
}
};
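The apply() overload pair just above is the usual C++11 idiom for walking a tuple at compile time: the I == tuple_size overload terminates the recursion, while the I < tuple_size overload visits element I and recurses on I + 1 (starting at I = 1 because slot 0 holds the fundamental-representation actions). A stripped-down sketch of the same idiom over a plain std::tuple, assuming only the standard library:

#include <tuple>
#include <type_traits>
#include <cstddef>
#include <iostream>

// Terminating overload: nothing left to visit.
template <std::size_t I = 0, typename Fn, typename... Ts>
typename std::enable_if<I == sizeof...(Ts), void>::type
for_each_in_tuple(std::tuple<Ts...>&, Fn) {}

// Recursive overload: visit element I, then recurse on I + 1.
template <std::size_t I = 0, typename Fn, typename... Ts>
typename std::enable_if<(I < sizeof...(Ts)), void>::type
for_each_in_tuple(std::tuple<Ts...>& t, Fn fn) {
  fn(std::get<I>(t));
  for_each_in_tuple<I + 1>(t, fn);
}

struct PrintElement {
  template <class T> void operator()(const T& x) const { std::cout << x << std::endl; }
};

void example_tuple_walk() {
  std::tuple<int, double> t(2, 3.5);
  for_each_in_tuple(t, PrintElement());   // prints 2 then 3.5
}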
template<class GaugeField> using ActionSet = std::vector<ActionLevel< GaugeField > >;
//template <class GaugeField>
//using ActionSet = std::vector<ActionLevel<GaugeField> >;
}}
template <class GaugeField, class R>
using ActionSet = std::vector<ActionLevel<GaugeField, R> >;
}
}
#endif


@@ -40,25 +40,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
////////////////////////////////////////////
// Abstract base interface
////////////////////////////////////////////
#include <qcd/action/ActionBase.h>
#include <qcd/action/ActionParams.h>
#include <Grid/qcd/action/ActionBase.h>
#include <Grid/qcd/action/ActionParams.h>
////////////////////////////////////////////
// Utility functions
////////////////////////////////////////////
#include <qcd/action/gauge/GaugeImpl.h>
#include <qcd/utils/WilsonLoops.h>
#include <Grid/qcd/action/gauge/GaugeImpl.h>
#include <Grid/qcd/utils/WilsonLoops.h>
#include <qcd/action/fermion/WilsonCompressor.h> //used by all wilson type fermions
#include <qcd/action/fermion/FermionOperatorImpl.h>
#include <qcd/action/fermion/FermionOperator.h>
#include <qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/WilsonCompressor.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
#include <Grid/qcd/action/fermion/FermionOperator.h>
#include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
////////////////////////////////////////////
// Gauge Actions
////////////////////////////////////////////
#include <qcd/action/gauge/WilsonGaugeAction.h>
#include <qcd/action/gauge/PlaqPlusRectangleAction.h>
#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>
namespace Grid {
namespace QCD {
@@ -107,41 +107,64 @@ typedef SymanzikGaugeAction<ConjugateGimplD> ConjugateSymanzikGaugeAction
// for EVERY .cc file. This define centralises the list and restores global push of impl cases
////////////////////////////////////////////////////////////////////////////////////////////////////
#define FermOpTemplateInstantiate(A) \
#define FermOp4dVecTemplateInstantiate(A) \
template class A<WilsonImplF>; \
template class A<WilsonImplD>; \
template class A<ZWilsonImplF>; \
template class A<ZWilsonImplD>; \
template class A<GparityWilsonImplF>; \
template class A<GparityWilsonImplD>;
#define AdjointFermOpTemplateInstantiate(A) \
template class A<WilsonAdjImplF>; \
template class A<WilsonAdjImplD>;
#define TwoIndexFermOpTemplateInstantiate(A) \
template class A<WilsonTwoIndexSymmetricImplF>; \
template class A<WilsonTwoIndexSymmetricImplD>;
#define FermOp5dVecTemplateInstantiate(A) \
template class A<DomainWallVec5dImplF>; \
template class A<DomainWallVec5dImplD>; \
template class A<ZDomainWallVec5dImplF>; \
template class A<ZDomainWallVec5dImplD>;
#define FermOpTemplateInstantiate(A) \
FermOp4dVecTemplateInstantiate(A) \
FermOp5dVecTemplateInstantiate(A)
#define GparityFermOpTemplateInstantiate(A)
////////////////////////////////////////////
// Fermion operators / actions
////////////////////////////////////////////
#include <qcd/action/fermion/WilsonFermion.h> // 4d wilson like
#include <qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
#include <qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
#include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
//#include <qcd/action/fermion/CloverFermion.h>
//#include <Grid/qcd/action/fermion/CloverFermion.h>
#include <qcd/action/fermion/CayleyFermion5D.h> // Cayley types
#include <qcd/action/fermion/DomainWallFermion.h>
#include <qcd/action/fermion/DomainWallFermion.h>
#include <qcd/action/fermion/MobiusFermion.h>
#include <qcd/action/fermion/ScaledShamirFermion.h>
#include <qcd/action/fermion/MobiusZolotarevFermion.h>
#include <qcd/action/fermion/ShamirZolotarevFermion.h>
#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
#include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/MobiusFermion.h>
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
#include <qcd/action/fermion/ContinuedFractionFermion5D.h> // Continued fraction
#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h> // Continued fraction
#include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
#include <qcd/action/fermion/PartialFractionFermion5D.h> // Partial fraction
#include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
#include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h> // Partial fraction
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
////////////////////////////////////////////////////////////////////////////////////////////////////
// More maintainable to maintain the following typedef list centrally, as more "impl" targets
@@ -157,6 +180,14 @@ typedef WilsonFermion<WilsonImplR> WilsonFermionR;
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
@@ -167,6 +198,11 @@ typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@@ -222,21 +258,21 @@ typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
///////////////////////////////////////////////////////////////////////////////
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
///////////////////////////////////////////////////////////////////////////////
#include <qcd/action/fermion/g5HermitianLinop.h>
#include <Grid/qcd/action/fermion/g5HermitianLinop.h>
////////////////////////////////////////
// Pseudo fermion combinations for HMC
////////////////////////////////////////
#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
#include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
#include <qcd/action/pseudofermion/TwoFlavour.h>
#include <qcd/action/pseudofermion/TwoFlavourRatio.h>
#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
#include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
#include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
#include <qcd/action/pseudofermion/OneFlavourRational.h>
#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
#endif


@@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
@@ -45,486 +48,352 @@ namespace QCD {
FourDimGrid,
FourDimRedBlackGrid,_M5,p),
mass(_mass)
{
}
{ }
template<class Impl>
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
{
// Assemble Din
int Ls=this->Ls;
for(int s=0;s<Ls;s++){
if ( s==0 ) {
// Din = bs psi[s] + cs[s] psi[s+1]
axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
// Din+= -mass*cs[s] psi[s+1]
axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
} else {
axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
}
}
template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag (Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass;
M5D(psi,chi,chi,lower,diag,upper);
}
template<class Impl>
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = bs;
std::vector<Coeff_t> upper= cs;
std::vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5D(psi,psi,Din,lower,diag,upper);
}
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = beo;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int i=0;i<Ls;i++) {
upper[i]=-ceo[i];
lower[i]=-ceo[i];
}
template<class Impl>
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
{
int Ls=this->Ls;
for(int s=0;s<Ls;s++){
if ( s==0 ) {
axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);
axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
} else {
axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
}
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int i=0;i<Ls;i++) {
upper[i]=-cee[i];
lower[i]=-cee[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
// override multiply
template<class Impl>
RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
template<class Impl>
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
FermionField Din(psi._grid);
// Assemble Din
/*
for(int s=0;s<Ls;s++){
if ( s==0 ) {
// Din = bs psi[s] + cs[s] psi[s+1]
axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
// Din+= -mass*cs[s] psi[s+1]
axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
} else {
axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
}
}
*/
Meooe5D(psi,Din);
this->DW(Din,chi,DaggerNo);
// ((b D_W + D_w hop terms +1) on s-diag
axpby(chi,1.0,1.0,chi,psi);
// Call Mooee??
for(int s=0;s<Ls;s++){
if ( s==0 ){
axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,0);
axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
} else {
axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
}
}
return norm2(chi);
}
template<class Impl>
RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
{
// Under adjoint
//D1+ D1- P- -> D1+^dag P+ D2-^dag
//D2- P+ D2+ P-D1-^dag D2+dag
FermionField Din(psi._grid);
// Apply Dw
this->DW(psi,Din,DaggerYes);
MeooeDag5D(Din,chi);
int Ls=this->Ls;
for(int s=0;s<Ls;s++){
// Collect the terms in DW
// Chi = bs Din[s] + cs[s] Din[s+1]
// Chi+= -mass*cs[s] psi[s+1]
/*
if ( s==0 ) {
axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus (chi,bs[s],Din,-mass*cs[0],Din,s,0);
axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
} else {
axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
}
*/
// FIXME just call MooeeDag??
// Collect the terms indept of DW
if ( s==0 ){
axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,0);
axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
} else {
axpby_ssp_pplus(chi,1.0,chi,-1.0,psi,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
}
}
// ((b D_W + D_w hop terms +1) on s-diag
axpby (chi,1.0,1.0,chi,psi);
return norm2(chi);
}
// half checkerboard operations
template<class Impl>
void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
FermionField tmp(psi._grid);
for (int s=0;s<Ls;s++){
// Assemble the 5d matrix
Meooe5D(psi,tmp);
#if 0
std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
for(int s=0;s<Ls;s++){
if ( s==0 ) {
// tmp = bs psi[s] + cs[s] psi[s+1]
// tmp+= -mass*cs[s] psi[s+1]
axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
} else {
axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
}
}
std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
#endif
// Apply 4d dslash
if ( psi.checkerboard == Odd ) {
this->DhopEO(tmp,chi,DaggerNo);
if ( s==0 ) {
upper[s] = -cee[s+1] ;
lower[s] = mass*cee[Ls-1];
} else if ( s==(Ls-1)) {
upper[s] = mass*cee[0];
lower[s] = -cee[s-1];
} else {
this->DhopOE(tmp,chi,DaggerNo);
upper[s]=-cee[s+1];
lower[s]=-cee[s-1];
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MeooeDag (const FermionField &psi, FermionField &chi)
{
FermionField tmp(psi._grid);
// Apply 4d dslash
if ( psi.checkerboard == Odd ) {
this->DhopEO(psi,tmp,DaggerYes);
} else {
this->DhopOE(psi,tmp,DaggerYes);
}
M5Ddag(psi,psi,chi,lower,diag,upper);
}
MeooeDag5D(tmp,chi);
#if 0
std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
// Assemble the 5d matrix
int Ls=this->Ls;
for(int s=0;s<Ls;s++){
if ( s==0 ) {
axpby_ssp_pplus(chi,beo[s],tmp, -ceo[s+1] ,tmp,s,s+1);
axpby_ssp_pminus(chi, 1.0,chi,mass*ceo[Ls-1],tmp,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus(chi,beo[s],tmp,mass*ceo[0],tmp,s,0);
axpby_ssp_pminus(chi,1.0,chi,-ceo[s-1],tmp,s,s-1);
} else {
axpby_ssp_pplus(chi,beo[s],tmp,-ceo[s+1],tmp,s,s+1);
axpby_ssp_pminus(chi,1.0 ,chi,-ceo[s-1],tmp,s,s-1);
}
}
std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
#endif
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0);
std::vector<Coeff_t> lower(Ls,-1.0);
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5Ddag(psi,chi,chi,lower,diag,upper);
}
template<class Impl>
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag =bs;
std::vector<Coeff_t> upper=cs;
std::vector<Coeff_t> lower=cs;
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5Ddag(psi,psi,Din,lower,diag,upper);
}
template<class Impl>
RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
FermionField Din(psi._grid);
// Assemble Din
Meooe5D(psi,Din);
this->DW(Din,chi,DaggerNo);
// ((b D_W + D_w hop terms +1) on s-diag
axpby(chi,1.0,1.0,chi,psi);
M5D(psi,chi);
return(norm2(chi));
}
template<class Impl>
RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
{
// Under adjoint
//D1+ D1- P- -> D1+^dag P+ D2-^dag
//D2- P+ D2+ P-D1-^dag D2+dag
FermionField Din(psi._grid);
// Apply Dw
this->DW(psi,Din,DaggerYes);
MeooeDag5D(Din,chi);
M5Ddag(psi,chi);
// ((b D_W + D_w hop terms +1) on s-diag
axpby (chi,1.0,1.0,chi,psi);
return norm2(chi);
}
// half checkerboard operations
template<class Impl>
void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
FermionField tmp(psi._grid);
Meooe5D(psi,tmp);
if ( psi.checkerboard == Odd ) {
this->DhopEO(tmp,chi,DaggerNo);
} else {
this->DhopOE(tmp,chi,DaggerNo);
}
}
template<class Impl>
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
for (int s=0;s<Ls;s++){
if ( s==0 ) {
axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,mass*cee[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(chi,bee[s],psi,mass*cee[s],psi,s,0);
axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
} else {
axpby_ssp_pminus(chi,bee[s],psi,-cee[s],psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MeooeDag (const FermionField &psi, FermionField &chi)
{
FermionField tmp(psi._grid);
// Apply 4d dslash
if ( psi.checkerboard == Odd ) {
this->DhopEO(psi,tmp,DaggerYes);
} else {
this->DhopOE(psi,tmp,DaggerYes);
}
MeooeDag5D(tmp,chi);
}
template<class Impl>
void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
int Ls=this->Ls;
FermionField tmp(psi._grid);
// Assemble the 5d matrix
for(int s=0;s<Ls;s++){
if ( s==0 ) {
// tmp = bs psi[s] + cs[s] psi[s+1]
// tmp+= -mass*cs[s] psi[s+1]
axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
} else {
axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
}
}
// Apply 4d dslash fragment
this->DhopDir(tmp,chi,dir,disp);
template<class Impl>
void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
FermionField tmp(psi._grid);
Meo5D(psi,tmp);
// Apply 4d dslash fragment
this->DhopDir(tmp,chi,dir,disp);
}
// force terms; five routines; default to Dhop on diagonal
template<class Impl>
void CayleyFermion5D<Impl>::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
FermionField Din(V._grid);
if ( dag == DaggerNo ) {
// U d/du [D_w D5] V = U d/du DW D5 V
Meooe5D(V,Din);
this->DhopDeriv(mat,U,Din,dag);
} else {
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
Meooe5D(U,Din);
this->DhopDeriv(mat,Din,V,dag);
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
for (int s=0;s<Ls;s++){
// Assemble the 5d matrix
if ( s==0 ) {
axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1] ,psi,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,mass*cee[Ls-1],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus(chi,bee[s],psi,mass*cee[0],psi,s,0);
axpby_ssp_pminus(chi,1.0,chi,-cee[s-1],psi,s,s-1);
} else {
axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1],psi,s,s+1);
axpby_ssp_pminus(chi,1.0 ,chi,-cee[s-1],psi,s,s-1);
}
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
// Apply (L^{\prime})^{-1}
axpby_ssp (chi,1.0,psi, 0.0,psi,0,0); // chi[0]=psi[0]
for (int s=1;s<Ls;s++){
axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
}
// L_m^{-1}
for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
}
// U_m^{-1} D^{-1}
for (int s=0;s<Ls-1;s++){
// Chi[s] + 1/d chi[s]
axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
}
axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable
// Apply U^{-1}
for (int s=Ls-2;s>=0;s--){
axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1); // chi[Ls]
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
// Apply (U^{\prime})^{-dagger}
axpby_ssp (chi,1.0,psi, 0.0,psi,0,0); // chi[0]=psi[0]
for (int s=1;s<Ls;s++){
axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
}
// U_m^{-\dagger}
for (int s=0;s<Ls-1;s++){
axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
}
// L_m^{-\dagger} D^{-dagger}
for (int s=0;s<Ls-1;s++){
axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
}
axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable
// Apply L^{-dagger}
for (int s=Ls-2;s>=0;s--){
axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1); // chi[Ls]
}
}
// force terms; five routines; default to Dhop on diagonal
template<class Impl>
void CayleyFermion5D<Impl>::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
FermionField Din(V._grid);
if ( dag == DaggerNo ) {
// U d/du [D_w D5] V = U d/du DW D5 V
Meooe5D(V,Din);
this->DhopDeriv(mat,U,Din,dag);
} else {
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
Meooe5D(U,Din);
this->DhopDeriv(mat,Din,V,dag);
}
};
template<class Impl>
void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
FermionField Din(V._grid);
if ( dag == DaggerNo ) {
// U d/du [D_w D5] V = U d/du DW D5 V
Meooe5D(V,Din);
this->DhopDerivOE(mat,U,Din,dag);
} else {
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
};
template<class Impl>
void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
FermionField Din(V._grid);
if ( dag == DaggerNo ) {
// U d/du [D_w D5] V = U d/du DW D5 V
Meooe5D(V,Din);
this->DhopDerivOE(mat,U,Din,dag);
} else {
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
Meooe5D(U,Din);
this->DhopDerivOE(mat,Din,V,dag);
}
};
template<class Impl>
void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
FermionField Din(V._grid);
if ( dag == DaggerNo ) {
// U d/du [D_w D5] V = U d/du DW D5 V
Meooe5D(V,Din);
this->DhopDerivEO(mat,U,Din,dag);
} else {
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
Meooe5D(U,Din);
this->DhopDerivEO(mat,Din,V,dag);
}
};
}
};
template<class Impl>
void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
FermionField Din(V._grid);
// Tanh
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
{
SetCoefficientsZolotarev(1.0,zdata,b,c);
if ( dag == DaggerNo ) {
// U d/du [D_w D5] V = U d/du DW D5 V
Meooe5D(V,Din);
this->DhopDerivEO(mat,U,Din,dag);
} else {
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
Meooe5D(U,Din);
this->DhopDerivEO(mat,Din,V,dag);
}
//Zolo
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
{
int Ls=this->Ls;
};
// Tanh
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
{
std::vector<Coeff_t> gamma(this->Ls);
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
SetCoefficientsInternal(1.0,gamma,b,c);
}
//Zolo
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
{
std::vector<Coeff_t> gamma(this->Ls);
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
SetCoefficientsInternal(zolo_hi,gamma,b,c);
}
//Zolo
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
{
int Ls=this->Ls;
///////////////////////////////////////////////////////////
// The Cayley coeffs (unprec)
///////////////////////////////////////////////////////////
omega.resize(Ls);
bs.resize(Ls);
cs.resize(Ls);
as.resize(Ls);
///////////////////////////////////////////////////////////
// The Cayley coeffs (unprec)
///////////////////////////////////////////////////////////
omega.resize(Ls);
bs.resize(Ls);
cs.resize(Ls);
as.resize(Ls);
//
// Ts = ( [bs+cs]Dw )^-1 ( (bs+cs) Dw )
// -(g5 ------- -1 ) ( g5 --------- + 1 )
// ( {2+(bs-cs)Dw} ) ( 2+(bs-cs) Dw )
//
// bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2( 1/omega(b+c) + (b-c) )
// cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2( 1/omega(b+c) - (b-c) )
//
// bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
// bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
//
// So
//
// Ts = ( [b+c]Dw/omega_s )^-1 ( (b+c) Dw /omega_s )
// -(g5 ------- -1 ) ( g5 --------- + 1 )
// ( {2+(b-c)Dw} ) ( 2+(b-c) Dw )
//
// Ts = ( [b+c]Dw )^-1 ( (b+c) Dw )
// -(g5 ------- -omega_s) ( g5 --------- + omega_s )
// ( {2+(b-c)Dw} ) ( 2+(b-c) Dw )
//
//
// Ts = ( [bs+cs]Dw )^-1 ( (bs+cs) Dw )
// -(g5 ------- -1 ) ( g5 --------- + 1 )
// ( {2+(bs-cs)Dw} ) ( 2+(bs-cs) Dw )
//
// bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2( 1/omega(b+c) + (b-c) )
// cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2( 1/omega(b+c) - (b-c) )
//
// bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
// bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
//
// So
//
// Ts = ( [b+c]Dw/omega_s )^-1 ( (b+c) Dw /omega_s )
// -(g5 ------- -1 ) ( g5 --------- + 1 )
// ( {2+(b-c)Dw} ) ( 2+(b-c) Dw )
//
// Ts = ( [b+c]Dw )^-1 ( (b+c) Dw )
// -(g5 ------- -omega_s) ( g5 --------- + omega_s )
// ( {2+(b-c)Dw} ) ( 2+(b-c) Dw )
//
double bpc = b+c;
double bmc = b-c;
for(int i=0; i < Ls; i++){
as[i] = 1.0;
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
bs[i] = 0.5*(bpc/omega[i] + bmc);
cs[i] = 0.5*(bpc/omega[i] - bmc);
}
////////////////////////////////////////////////////////
// Constants for the preconditioned matrix Cayley form
////////////////////////////////////////////////////////
bee.resize(Ls);
cee.resize(Ls);
beo.resize(Ls);
ceo.resize(Ls);
for(int i=0;i<Ls;i++){
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
beo[i]=as[i]*bs[i];
ceo[i]=-as[i]*cs[i];
}
aee.resize(Ls);
aeo.resize(Ls);
for(int i=0;i<Ls;i++){
aee[i]=cee[i];
aeo[i]=ceo[i];
}
//////////////////////////////////////////
// LDU decomposition of eeoo
//////////////////////////////////////////
dee.resize(Ls);
lee.resize(Ls);
leem.resize(Ls);
uee.resize(Ls);
ueem.resize(Ls);
for(int i=0;i<Ls;i++){
double bpc = b+c;
double bmc = b-c;
for(int i=0; i < Ls; i++){
as[i] = 1.0;
omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
bs[i] = 0.5*(bpc/omega[i] + bmc);
cs[i] = 0.5*(bpc/omega[i] - bmc);
}
////////////////////////////////////////////////////////
// Constants for the preconditioned matrix Cayley form
////////////////////////////////////////////////////////
bee.resize(Ls);
cee.resize(Ls);
beo.resize(Ls);
ceo.resize(Ls);
dee[i] = bee[i];
for(int i=0;i<Ls;i++){
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
beo[i]=as[i]*bs[i];
ceo[i]=-as[i]*cs[i];
}
aee.resize(Ls);
aeo.resize(Ls);
for(int i=0;i<Ls;i++){
aee[i]=cee[i];
aeo[i]=ceo[i];
}
//////////////////////////////////////////
// LDU decomposition of eeoo
//////////////////////////////////////////
dee.resize(Ls);
lee.resize(Ls);
leem.resize(Ls);
uee.resize(Ls);
ueem.resize(Ls);
for(int i=0;i<Ls;i++){
if ( i < Ls-1 ) {
dee[i] = bee[i];
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
if ( i < Ls-1 ) {
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1];
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
ueem[i]=mass;
for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
ueem[i]*= aee[0]/bee[0];
} else {
lee[i] =0.0;
leem[i]=0.0;
uee[i] =0.0;
ueem[i]=0.0;
}
}
{
double delta_d=mass*cee[Ls-1];
for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
dee[Ls-1] += delta_d;
leem[i]=mass*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1];
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
ueem[i]=mass;
for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
ueem[i]*= aee[0]/bee[0];
} else {
lee[i] =0.0;
leem[i]=0.0;
uee[i] =0.0;
ueem[i]=0.0;
}
}
{
Coeff_t delta_d=mass*cee[Ls-1];
for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
dee[Ls-1] += delta_d;
}
}
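The coefficient algebra written out in the comment block above can be verified numerically: with omega_s = gamma_s * zolo_hi, the definitions bs = 0.5*((b+c)/omega_s + (b-c)) and cs = 0.5*((b+c)/omega_s - (b-c)) imply bs + cs = (b+c)/omega_s and bs - cs = b - c. A small stand-alone check, with arbitrary illustrative values of b, c and omega:

#include <cassert>
#include <cmath>
#include <vector>

void check_cayley_coefficient_identities() {
  const double b = 1.5, c = 0.5;                         // illustrative Mobius parameters
  const std::vector<double> omega = {0.9, 1.0, 1.1, 1.3};
  for (double w : omega) {
    double bs = 0.5 * ((b + c) / w + (b - c));
    double cs = 0.5 * ((b + c) / w - (b - c));
    assert(std::fabs((bs + cs) - (b + c) / w) < 1e-14);  // bs + cs = (b+c)/omega
    assert(std::fabs((bs - cs) - (b - c)) < 1e-14);      // bs - cs = b - c
  }
}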
FermOpTemplateInstantiate(CayleyFermion5D);
GparityFermOpTemplateInstantiate(CayleyFermion5D);


@@ -51,6 +51,29 @@ namespace Grid {
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Meo5D (const FermionField &psi, FermionField &chi);
virtual void M5D (const FermionField &psi, FermionField &chi);
virtual void M5Ddag(const FermionField &psi, FermionField &chi);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
void M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
virtual void Instantiatable(void)=0;
// force terms; five routines; default to Dhop on diagonal
@@ -68,23 +91,23 @@ namespace Grid {
RealD mass;
// Cayley form Moebius (tanh and zolotarev)
std::vector<RealD> omega;
std::vector<RealD> bs; // S dependent coeffs
std::vector<RealD> cs;
std::vector<RealD> as;
std::vector<Coeff_t> omega;
std::vector<Coeff_t> bs; // S dependent coeffs
std::vector<Coeff_t> cs;
std::vector<Coeff_t> as;
// For preconditioning Cayley form
std::vector<RealD> bee;
std::vector<RealD> cee;
std::vector<RealD> aee;
std::vector<RealD> beo;
std::vector<RealD> ceo;
std::vector<RealD> aeo;
std::vector<Coeff_t> bee;
std::vector<Coeff_t> cee;
std::vector<Coeff_t> aee;
std::vector<Coeff_t> beo;
std::vector<Coeff_t> ceo;
std::vector<Coeff_t> aeo;
// LDU factorisation of the eeoo matrix
std::vector<RealD> lee;
std::vector<RealD> leem;
std::vector<RealD> uee;
std::vector<RealD> ueem;
std::vector<RealD> dee;
std::vector<Coeff_t> lee;
std::vector<Coeff_t> leem;
std::vector<Coeff_t> uee;
std::vector<Coeff_t> ueem;
std::vector<Coeff_t> dee;
// Constructors
CayleyFermion5D(GaugeField &_Umu,
@@ -97,9 +120,20 @@ namespace Grid {
protected:
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
};
}
}
#define INSTANTIATE_DPERP(A)\
template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
template void CayleyFermion5D< A >::MooeeInv (const FermionField &psi, FermionField &chi); \
template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
#define CAYLEY_DPERP_CACHE
#undef CAYLEY_DPERP_LINALG
#endif


@@ -0,0 +1,211 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards..
template<class Impl>
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
int Ls =this->Ls;
GridBase *grid=psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
for(int s=0;s<Ls;s++){
auto tmp = psi._odata[0];
if ( s==0 ) {
spProj5m(tmp,psi._odata[ss+s+1]);
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
spProj5p(tmp,psi._odata[ss+Ls-1]);
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
} else if ( s==(Ls-1)) {
spProj5m(tmp,psi._odata[ss+0]);
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
spProj5p(tmp,psi._odata[ss+s-1]);
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
} else {
spProj5m(tmp,psi._odata[ss+s+1]);
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
spProj5p(tmp,psi._odata[ss+s-1]);
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
}
}
}
}
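Per 4d site, the M5D kernel above is a cyclic tridiagonal multiply in the fifth dimension: chi[s] = diag[s]*phi[s] + upper[s]*P_- psi[s+1] + lower[s]*P_+ psi[s-1], with the s = 0 and s = Ls-1 rows wrapping around to pick up the mass terms that the callers folded into upper[Ls-1] and lower[0]. Dropping the spin projectors, the same stencil on plain scalars looks like this (illustrative sketch only):

#include <vector>

// Scalar analogue of the s-stencil in M5D: a cyclic tridiagonal multiply.
void tridiag5d_scalar(const std::vector<double>& psi, const std::vector<double>& phi,
                      std::vector<double>& chi,
                      const std::vector<double>& lower, const std::vector<double>& diag,
                      const std::vector<double>& upper) {
  const int Ls = (int)psi.size();
  for (int s = 0; s < Ls; s++) {
    const int sp = (s == Ls - 1) ? 0 : s + 1;   // "psi[s+1]" with wrap-around to 0
    const int sm = (s == 0) ? Ls - 1 : s - 1;   // "psi[s-1]" with wrap-around to Ls-1
    chi[s] = diag[s] * phi[s] + upper[s] * psi[sp] + lower[s] * psi[sm];
  }
}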
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
int Ls =this->Ls;
GridBase *grid=psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
for(int s=0;s<Ls;s++){
if ( s==0 ) {
spProj5p(tmp,psi._odata[ss+s+1]);
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
spProj5m(tmp,psi._odata[ss+Ls-1]);
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
} else if ( s==(Ls-1)) {
spProj5p(tmp,psi._odata[ss+0]);
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
spProj5m(tmp,psi._odata[ss+s-1]);
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
} else {
spProj5p(tmp,psi._odata[ss+s+1]);
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
spProj5m(tmp,psi._odata[ss+s-1]);
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
}
}
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &chi)
{
GridBase *grid=psi._grid;
int Ls=this->Ls;
chi.checkerboard=psi.checkerboard;
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
// Apply (L^{\prime})^{-1}
chi[ss]=psi[ss]; // chi[0]=psi[0]
for(int s=1;s<Ls;s++){
spProj5p(tmp,chi[ss+s-1]);
chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
}
// L_m^{-1}
for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
spProj5m(tmp,chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
}
// U_m^{-1} D^{-1}
for (int s=0;s<Ls-1;s++){
// Chi[s] + 1/d chi[s]
spProj5p(tmp,chi[ss+Ls-1]);
chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
}
chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
// Apply U^{-1}
for (int s=Ls-2;s>=0;s--){
spProj5m(tmp,chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
}
}
}
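MooeeInv above applies the LDU factorisation assembled in SetCoefficientsInternal: forward-substitute through L' and the dense last row L_m, rescale by the diagonal D while folding in the U_m column, then back-substitute through U'. With the P_+/P_- projectors stripped out, the scalar shape of that solve is (sketch only, same vector names as the class):

#include <vector>

void ldu_solve_scalar(const std::vector<double>& psi, std::vector<double>& chi,
                      const std::vector<double>& lee, const std::vector<double>& leem,
                      const std::vector<double>& uee, const std::vector<double>& ueem,
                      const std::vector<double>& dee) {
  const int Ls = (int)psi.size();
  chi[0] = psi[0];
  for (int s = 1; s < Ls; s++) chi[s] = psi[s] - lee[s - 1] * chi[s - 1];     // (L')^{-1}
  for (int s = 0; s < Ls - 1; s++) chi[Ls - 1] -= leem[s] * chi[s];           // L_m^{-1}
  for (int s = 0; s < Ls - 1; s++)                                            // U_m^{-1} D^{-1}
    chi[s] = chi[s] / dee[s] - (ueem[s] / dee[Ls - 1]) * chi[Ls - 1];
  chi[Ls - 1] /= dee[Ls - 1];
  for (int s = Ls - 2; s >= 0; s--) chi[s] -= uee[s] * chi[s + 1];            // (U')^{-1}
}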
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
GridBase *grid=psi._grid;
int Ls=this->Ls;
assert(psi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
// Apply (U^{\prime})^{-dagger}
chi[ss]=psi[ss];
for (int s=1;s<Ls;s++){
spProj5m(tmp,chi[ss+s-1]);
chi[ss+s] = psi[ss+s]-uee[s-1]*tmp;
}
// U_m^{-\dagger}
for (int s=0;s<Ls-1;s++){
spProj5p(tmp,chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - ueem[s]*tmp;
}
// L_m^{-\dagger} D^{-dagger}
for (int s=0;s<Ls-1;s++){
spProj5m(tmp,chi[ss+Ls-1]);
chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(leem[s]/dee[Ls-1])*tmp;
}
chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
// Apply L^{-dagger}
for (int s=Ls-2;s>=0;s--){
spProj5p(tmp,chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - lee[s]*tmp;
}
}
}
#ifdef CAYLEY_DPERP_CACHE
INSTANTIATE_DPERP(WilsonImplF);
INSTANTIATE_DPERP(WilsonImplD);
INSTANTIATE_DPERP(GparityWilsonImplF);
INSTANTIATE_DPERP(GparityWilsonImplD);
INSTANTIATE_DPERP(ZWilsonImplF);
INSTANTIATE_DPERP(ZWilsonImplD);
#endif
}}


@@ -0,0 +1,133 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Eigen/Dense>
#include <Grid.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
/*
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
}
*/
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
{
int Ls=this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard=psi.checkerboard;
assert(Ls==LLs);
Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = bee[s];
Pminus(s,s)= bee[s];
}
for(int s=0;s<Ls-1;s++){
Pminus(s,s+1) = -cee[s];
}
for(int s=0;s<Ls-1;s++){
Pplus(s+1,s) = -cee[s+1];
}
Pplus (0,Ls-1) = mass*cee[0];
Pminus(Ls-1,0) = mass*cee[Ls-1];
Eigen::MatrixXd PplusMat ;
Eigen::MatrixXd PminusMat;
if ( inv ) {
PplusMat =Pplus.inverse();
PminusMat=Pminus.inverse();
} else {
PplusMat =Pplus;
PminusMat=Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
// For the non-vectorised s-direction this is simple
for(auto site=0;site<vol;site++){
SiteSpinor SiteChi;
SiteHalfSpinor SitePplus;
SiteHalfSpinor SitePminus;
for(int s1=0;s1<Ls;s1++){
SiteChi =zero;
for(int s2=0;s2<Ls;s2++){
int lex2 = s2+Ls*site;
if ( PplusMat(s1,s2) != 0.0 ) {
spProj5p(SitePplus,psi[lex2]);
accumRecon5p(SiteChi,PplusMat (s1,s2)*SitePplus);
}
if ( PminusMat(s1,s2) != 0.0 ) {
spProj5m(SitePminus,psi[lex2]);
accumRecon5m(SiteChi,PminusMat(s1,s2)*SitePminus);
}
}
chi[s1+Ls*site] = SiteChi*0.5;
}
}
}
template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
}}
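For concreteness, here is the matrix structure the nested loops above build, read off directly from the assignments to Pplus and Pminus; the Ls=4 size is only an illustrative choice. Writing b_s = bee[s], c_s = cee[s] and m = mass:
\[
P_{+}=\begin{pmatrix} b_0 & 0 & 0 & m\,c_0\\ -c_1 & b_1 & 0 & 0\\ 0 & -c_2 & b_2 & 0\\ 0 & 0 & -c_3 & b_3 \end{pmatrix},
\qquad
P_{-}=\begin{pmatrix} b_0 & -c_0 & 0 & 0\\ 0 & b_1 & -c_1 & 0\\ 0 & 0 & b_2 & -c_2\\ m\,c_3 & 0 & 0 & b_3 \end{pmatrix}.
\]
With inv set these are replaced by their Eigen inverses, and with dag by their adjoints, before being applied site by site to the P_+ and P_- projections of psi.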

View File

@ -0,0 +1,149 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards
template<class Impl>
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
int Ls=this->Ls;
for(int s=0;s<Ls;s++){
if ( s==0 ) {
axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,0);
axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,s-1);
} else {
axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1);
axpby_ssp_pplus(chi,1.0,chi,lower[s],psi,s,s-1);
}
}
}
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
int Ls=this->Ls;
for(int s=0;s<Ls;s++){
if ( s==0 ) {
axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,0);
axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s-1);
} else {
axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s-1);
}
}
}
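In equations (my paraphrase of the axpby_ssp calls above, with P_± = (1 ± γ_5)/2): for each slice s, M5D computes
\[
\chi_s = \mathrm{diag}_s\,\phi_s + \mathrm{upper}_s\,P_{-}\,\psi_{(s+1)\bmod L_s} + \mathrm{lower}_s\,P_{+}\,\psi_{(s-1)\bmod L_s},
\]
and M5Ddag is the same expression with P_+ and P_- interchanged; any mass or boundary factors on the wrap-around terms are assumed to be folded into the lower/upper coefficient vectors by the caller.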
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &chi)
{
chi.checkerboard=psi.checkerboard;
int Ls=this->Ls;
// Apply (L^{\prime})^{-1}
axpby_ssp (chi,1.0,psi, 0.0,psi,0,0); // chi[0]=psi[0]
for (int s=1;s<Ls;s++){
axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
}
// L_m^{-1}
for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
}
// U_m^{-1} D^{-1}
for (int s=0;s<Ls-1;s++){
// Chi[s] + 1/d chi[s]
axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
}
axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable cost: final 1/dee[Ls-1] rescale of the Ls-1 slice
// Apply U^{-1}
for (int s=Ls-2;s>=0;s--){
axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1); // chi[Ls]
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
chi.checkerboard=psi.checkerboard;
int Ls=this->Ls;
// Apply (U^{\prime})^{-dagger}
axpby_ssp (chi,1.0,psi, 0.0,psi,0,0); // chi[0]=psi[0]
for (int s=1;s<Ls;s++){
axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
}
// U_m^{-\dagger}
for (int s=0;s<Ls-1;s++){
axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
}
// L_m^{-\dagger} D^{-dagger}
for (int s=0;s<Ls-1;s++){
axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
}
axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable cost: final 1/dee[Ls-1] rescale of the Ls-1 slice
// Apply L^{-dagger}
for (int s=Ls-2;s>=0;s--){
axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1); // chi[Ls]
}
}
#ifdef CAYLEY_DPERP_LINALG
INSTANTIATE(WilsonImplF);
INSTANTIATE(WilsonImplD);
INSTANTIATE(GparityWilsonImplF);
INSTANTIATE(GparityWilsonImplD);
#endif
}
}

View File

@ -0,0 +1,309 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Eigen/Dense>
#include <Grid.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
}
template<class Impl>
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
GridBase *grid=psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs==nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type * u_p = (scalar_type *)&u[0];
scalar_type * l_p = (scalar_type *)&l[0];
scalar_type * d_p = (scalar_type *)&d[0];
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o+i*LLs;
int ss = o*nsimd+i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0;v<LLs;v++){
int vp=(v+1)%LLs;
int vm=(v+LLs-1)%LLs;
spProj5m(hp,psi[ss+vp]);
spProj5p(hm,psi[ss+vm]);
if ( vp<=v ) rotate(hp,hp,1);
if ( vm>=v ) rotate(hm,hm,nsimd-1);
hp=hp*0.5;
hm=hm*0.5;
spRecon5m(fp,hp);
spRecon5p(fm,hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
}
}
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
GridBase *grid=psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs==nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type * u_p = (scalar_type *)&u[0];
scalar_type * l_p = (scalar_type *)&l[0];
scalar_type * d_p = (scalar_type *)&d[0];
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o+i*LLs;
int ss = o*nsimd+i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0;v<LLs;v++){
int vp=(v+1)%LLs;
int vm=(v+LLs-1)%LLs;
spProj5p(hp,psi[ss+vp]);
spProj5m(hm,psi[ss+vm]);
if ( vp<=v ) rotate(hp,hp,1);
if ( vm>=v ) rotate(hm,hm,nsimd-1);
hp=hp*0.5;
hm=hm*0.5;
spRecon5p(fp,hp);
spRecon5m(fm,hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
{
int Ls=this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard=psi.checkerboard;
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = bee[s];
Pminus(s,s)= bee[s];
}
for(int s=0;s<Ls-1;s++){
Pminus(s,s+1) = -cee[s];
}
for(int s=0;s<Ls-1;s++){
Pplus(s+1,s) = -cee[s+1];
}
Pplus (0,Ls-1) = mass*cee[0];
Pminus(Ls-1,0) = mass*cee[Ls-1];
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
if ( inv ) {
PplusMat =Pplus.inverse();
PminusMat=Pminus.inverse();
} else {
PplusMat =Pplus;
PminusMat=Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd=Simd::Nsimd();
Vector<iSinglet<Simd> > Matp(Ls*LLs);
Vector<iSinglet<Simd> > Matm(Ls*LLs);
for(int s2=0;s2<Ls;s2++){
for(int s1=0;s1<LLs;s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type *)&Vp;
scalar_type *sm = (scalar_type *)&Vm;
for(int l=0;l<Nsimd;l++){
sp[l] = PplusMat (l*istride+s1*ostride ,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}
}
// Dynamically allocate on the stack to get per-thread buffers without serialised heap access
PARALLEL_FOR_LOOP
for(auto site=0;site<vol;site++){
// SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
// SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
// SiteSpinor *SiteChi =(SiteSpinor *) alloca(LLs*sizeof(SiteSpinor));
Vector<SiteHalfSpinor> SitePplus(LLs);
Vector<SiteHalfSpinor> SitePminus(LLs);
Vector<SiteHalfSpinor> SiteChiP(LLs);
Vector<SiteHalfSpinor> SiteChiM(LLs);
Vector<SiteSpinor> SiteChi(LLs);
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
for(int s=0;s<LLs;s++){
int lex = s+LLs*site;
spProj5p(SitePplus[s] ,psi[lex]);
spProj5m(SitePminus[s],psi[lex]);
SiteChiP[s]=zero;
SiteChiM[s]=zero;
}
int s=0;
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
vbroadcast(BcastP,SitePplus [s2],l);
vbroadcast(BcastM,SitePminus[s2],l);
for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
}
s++;
}}
for(int s=0;s<LLs;s++){
int lex = s+LLs*site;
spRecon5p(SiteChi[s],SiteChiP[s]);
accumRecon5m(SiteChi[s],SiteChiM[s]);
chi[lex] = SiteChi[s]*0.5;
}
}
}
INSTANTIATE_DPERP(DomainWallVec5dImplD);
INSTANTIATE_DPERP(DomainWallVec5dImplF);
INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
}}
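The coefficient packing near the top of M5D/M5Ddag (the u_p/l_p/d_p type-pun loops) is easy to lose in the diff. Below is a minimal standalone sketch of that interleaving, under the assumption that the fifth dimension of length Ls is folded into SIMD lanes so that vector element o, lane i, carries the coefficient for s = o + i*LLs; the names and sizes are illustrative, not Grid code.

#include <cstdio>
#include <vector>

int main() {
  const int Ls    = 8;           // full fifth-dimension extent
  const int nsimd = 4;           // assumed SIMD width along s
  const int LLs   = Ls / nsimd;  // reduced extent, as in _rdimensions[0]

  std::vector<double> upper(Ls); // scalar coefficients, stand-in for upper[s]
  for (int s = 0; s < Ls; s++) upper[s] = 10.0 + s;

  // u_p stands in for the type-punned scalar view of Vector<iSinglet<Simd>> u(LLs)
  std::vector<double> u_p(LLs * nsimd);
  for (int o = 0; o < LLs; o++) {     // outer (vectorised) index
    for (int i = 0; i < nsimd; i++) { // SIMD lane
      int s  = o + i * LLs;           // logical s coordinate
      int ss = o * nsimd + i;         // slot inside the packed vector
      u_p[ss] = upper[s];
    }
  }

  for (int o = 0; o < LLs; o++)
    for (int i = 0; i < nsimd; i++)
      std::printf("element %d lane %d -> s=%d coeff=%g\n",
                  o, i, o + i * LLs, u_p[o * nsimd + i]);
  return 0;
}

The rotate(hp,hp,1) and rotate(hm,hm,nsimd-1) calls in the main loop then shift data across SIMD lanes whenever the s±1 neighbour falls in a different lane block, which is what the vp<=v and vm>=v conditionals detect.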

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_DOMAIN_WALL_FERMION_H
#define GRID_QCD_DOMAIN_WALL_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -1,35 +1,36 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
Copyright (C) 2015
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_FERMION_OPERATOR_IMPL_H
#define GRID_QCD_FERMION_OPERATOR_IMPL_H
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_FERMION_OPERATOR_IMPL_H
#define GRID_QCD_FERMION_OPERATOR_IMPL_H
namespace Grid {
@ -75,7 +76,7 @@ namespace Grid {
//
//
// template<class Impl>
// class MyOp : pubic<Impl> {
// class MyOp : public<Impl> {
// public:
//
// INHERIT_ALL_IMPL_TYPES(Impl);
@ -99,247 +100,281 @@ namespace Grid {
typedef typename Impl::SiteSpinor SiteSpinor; \
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
typedef typename Impl::Compressor Compressor; \
typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams;
typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams; \
typedef typename Impl::Coeff_t Coeff_t;
#define INHERIT_IMPL_TYPES(Base) \
INHERIT_GIMPL_TYPES(Base)\
INHERIT_GIMPL_TYPES(Base) \
INHERIT_FIMPL_TYPES(Base)
///////
// Single flavour four spinors with colour index
///////
template<class S,int Nrepresentation=Nc>
class WilsonImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
class WilsonImpl
: public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public:
static const int Dimension = Representation::Dimension;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
//Necessary?
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
const bool LsVectorised=false;
typedef _Coeff_t Coeff_t;
typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
template<typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
template<typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
typedef iImplSpinor <Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
ImplParams Params;
WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
mult(&phi(),&U(mu),&chi());
}
template<class ref>
inline void loadLinkElement(Simd & reg,ref &memory){
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
conformable(Uds._grid,GaugeGrid);
conformable(Umu._grid,GaugeGrid);
GaugeLinkField U(GaugeGrid);
for(int mu=0;mu<Nd;mu++){
U = PeekIndex<LorentzIndex>(Umu,mu);
PokeIndex<LorentzIndex>(Uds,U,mu);
U = adj(Cshift(U,mu,-1));
PokeIndex<LorentzIndex>(Uds,U,mu+4);
}
inline void multLink(SiteHalfSpinor &phi,
const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi,
int mu,
StencilEntry *SE,
StencilImpl &St) {
mult(&phi(), &U(mu), &chi());
}
template <class ref>
inline void loadLinkElement(Simd &reg,
ref &memory) {
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &Uds,
const GaugeField &Umu) {
conformable(Uds._grid, GaugeGrid);
conformable(Umu._grid, GaugeGrid);
GaugeLinkField U(GaugeGrid);
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Uds, U, mu);
U = adj(Cshift(U, mu, -1));
PokeIndex<LorentzIndex>(Uds, U, mu + 4);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
GaugeLinkField link(mat._grid);
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
int Ls=Btilde._grid->_fdimensions[0];
GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP
for(int sss=0;sss<tmp._grid->oSites();sss++){
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
PARALLEL_FOR_LOOP
for(int sss=0;sss<tmp._grid->oSites();sss++){
int sU=sss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
}
}
}
PokeIndex<LorentzIndex>(mat,tmp,mu);
}
};
///////
// Single flavour four spinors with colour index, 5d redblack
///////
template<class S,int Nrepresentation=Nc>
class DomainWallRedBlack5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
public:
typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
static const int Dimension = Nrepresentation;
const bool LsVectorised=true;
typedef _Coeff_t Coeff_t;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
template<typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
template<typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
template<typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
template<typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
typedef iImplSpinor <Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef Lattice<SiteSpinor> FermionField;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef Lattice<SiteSpinor> FermionField;
// Make the doubled gauge field a *scalar*
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
typedef iImplGaugeLink <typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef iImplDoubledGaugeField<typename Simd::scalar_type>
SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type>
SiteScalarGaugeField; // scalar
typedef iImplGaugeLink<typename Simd::scalar_type>
SiteScalarGaugeLink; // scalar
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
ImplParams Params;
DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {};
DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
bool overlapCommsCompute(void) { return false; };
template<class ref>
inline void loadLinkElement(Simd & reg,ref &memory){
vsplat(reg,memory);
template <class ref>
inline void loadLinkElement(Simd &reg, ref &memory) {
vsplat(reg, memory);
}
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
{
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
SiteGaugeLink UU;
for(int i=0;i<Nrepresentation;i++){
for(int j=0;j<Nrepresentation;j++){
vsplat(UU()()(i,j),U(mu)()(i,j));
for (int i = 0; i < Nrepresentation; i++) {
for (int j = 0; j < Nrepresentation; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
mult(&phi(),&UU(),&chi());
mult(&phi(), &UU(), &chi());
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
SiteScalarGaugeField ScalarUmu;
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,
const GaugeField &Umu) {
SiteScalarGaugeField ScalarUmu;
SiteDoubledGaugeField ScalarUds;
GaugeLinkField U (Umu._grid);
GaugeField Uadj(Umu._grid);
for(int mu=0;mu<Nd;mu++){
U = PeekIndex<LorentzIndex>(Umu,mu);
U = adj(Cshift(U,mu,-1));
PokeIndex<LorentzIndex>(Uadj,U,mu);
}
for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
peekLocalSite(ScalarUmu,Umu,lcoor);
for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu,Uadj,lcoor);
for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds,Uds,lcoor);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
GaugeLinkField U(Umu._grid);
GaugeField Uadj(Umu._grid);
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
U = adj(Cshift(U, mu, -1));
PokeIndex<LorentzIndex>(Uadj, U, mu);
}
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds, lcoor);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
FermionField &A, int mu) {
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
FermionField &Atilde, int mu) {
assert(0);
}
};
////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*?
////////////////////////////////////////////////////////////////////////////////////////
template<class S,int Nrepresentation>
class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{
public:
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp >;
template<typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp >;
template<typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >, Ngp >;
typedef iImplSpinor <Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
template <class S, int Nrepresentation,class _Coeff_t = RealD>
class GparityWilsonImpl
: public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
public:
static const int Dimension = Nrepresentation;
typedef Lattice<SiteSpinor> FermionField;
const bool LsVectorised=false;
typedef _Coeff_t Coeff_t;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template <typename vtype>
using iImplSpinor =
iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
template <typename vtype>
using iImplHalfSpinor =
iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
template <typename vtype>
using iImplDoubledGaugeField =
iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
typedef GparityWilsonImplParams ImplParams;
ImplParams Params;
GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
// provide the multiply by link that is differentiated between Gparity (with
// flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj;
vobj vtmp;
sobj stmp;
GridBase *grid = St._grid;
const int Nsimd = grid->Nsimd();
int direction = St._directions[mu];
int distance = St._distances[mu];
int ptype = St._permute_type[mu];
int sl = St._grid->_simd_layout[direction];
int direction = St._directions[mu];
int distance = St._distances[mu];
int ptype = St._permute_type[mu];
int sl = St._grid->_simd_layout[direction];
// Fixme X.Y.Z.T hardcode in stencil
int mmu = mu % Nd;
int mmu = mu % Nd;
// assert our assumptions
assert((distance==1)||(distance==-1)); // nearest neighbour stencil hard code
assert((sl==1)||(sl==2));
assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code
assert((sl == 1) || (sl == 2));
std::vector<int> icoor;
if ( SE->_around_the_world && Params.twists[mmu] ) {
if ( sl == 2 ) {
@ -380,7 +415,7 @@ PARALLEL_FOR_LOOP
mult(&phi(1),&U(1)(mu),&chi(1));
}
}
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
@ -393,7 +428,7 @@ PARALLEL_FOR_LOOP
GaugeLinkField Uconj(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
for(int mu=0;mu<Nd;mu++){
@ -401,19 +436,19 @@ PARALLEL_FOR_LOOP
U = PeekIndex<LorentzIndex>(Umu,mu);
Uconj = conjugate(U);
// This phase could come from a simple bc 1,1,-1,1 ..
int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
if ( Params.twists[mu] ) {
Uconj = where(coor==neglink,-Uconj,Uconj);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu) = U[ss]();
Uds[ss](1)(mu) = Uconj[ss]();
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu) = U[ss]();
Uds[ss](1)(mu) = Uconj[ss]();
}
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
Uconj = adj(Cshift(Uconj,mu,-1));
@ -423,68 +458,86 @@ PARALLEL_FOR_LOOP
Utmp = where(coor==0,Uconj,Utmp);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu+4) = Utmp[ss]();
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu+4) = Utmp[ss]();
}
Utmp = Uconj;
if ( Params.twists[mu] ) {
Utmp = where(coor==0,U,Utmp);
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](1)(mu+4) = Utmp[ss]();
}
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](1)(mu+4) = Utmp[ss]();
}
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
FermionField &A, int mu) {
// DhopDir provides U or Uconj depending on coor/flavour.
GaugeLinkField link(mat._grid);
// use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PARALLEL_FOR_LOOP
for(auto ss=tmp.begin();ss<tmp.end();ss++){
link[ss]() = tmp[ss](0,0) - conjugate(tmp[ss](1,1)) ;
}
PokeIndex<LorentzIndex>(mat,link,mu);
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
PARALLEL_FOR_LOOP
for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
}
PokeIndex<LorentzIndex>(mat, link, mu);
return;
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
int Ls=Btilde._grid->_fdimensions[0];
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
FermionField &Atilde, int mu) {
int Ls = Btilde._grid->_fdimensions[0];
GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP
for(int ss=0;ss<tmp._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
int sF = s+Ls*ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF]));
tmp[ss]() = tmp[ss]()+ ttmp(0,0) + conjugate(ttmp(1,1));
PARALLEL_FOR_LOOP
for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
}
}
PokeIndex<LorentzIndex>(mat,tmp,mu);
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}
};
typedef WilsonImpl<vComplex ,Nc> WilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation > WilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD; // Double
typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
}
typedef WilsonImpl<vComplex, AdjointRepresentation > WilsonAdjImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF; // Float
typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD; // Double
typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
typedef GparityWilsonImpl<vComplex, Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double
}
}
#endif

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_MOBIUS_FERMION_H
#define GRID_QCD_MOBIUS_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
#define GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
#define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
#define OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
#define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
#define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
#define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
#define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_SCALED_SHAMIR_FERMION_H
#define GRID_QCD_SCALED_SHAMIR_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
#define GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {

View File

@ -1,319 +1,315 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
Copyright (C) 2015
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
const std::vector<int> WilsonFermionStatic::directions ({0,1,2,3, 0, 1, 2, 3});
const std::vector<int> WilsonFermionStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermionStatic::HandOptDslash;
const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2,
3});
const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1,
-1, -1});
int WilsonFermionStatic::HandOptDslash;
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
template<class Impl>
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu,
GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid,
RealD _mass,const ImplParams &p) :
Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil (&Fgrid,npoint,Even,directions,displacements),
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd (&Hgrid)
{
// Allocate the required comms buffer
ImportGauge(_Umu);
template <class Impl>
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements),
StencilEven(&Hgrid, npoint, Even, directions,
displacements), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions,
displacements), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid) {
// Allocate the required comms buffer
ImportGauge(_Umu);
}
template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
GaugeField HUmu(_Umu._grid);
HUmu = _Umu * (-0.5);
Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd, Umu);
}
/////////////////////////////
// Implement the interface
/////////////////////////////
template <class Impl>
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerNo);
return axpy_norm(out, 4 + mass, in, out);
}
template <class Impl>
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerYes);
return axpy_norm(out, 4 + mass, in, out);
}
template <class Impl>
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
template<class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
GaugeField HUmu(_Umu._grid);
HUmu = _Umu*(-0.5);
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
pickCheckerboard(Even,UmuEven,Umu);
pickCheckerboard(Odd ,UmuOdd,Umu);
}
/////////////////////////////
// Implement the interface
/////////////////////////////
template<class Impl>
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out)
{
out.checkerboard=in.checkerboard;
Dhop(in,out,DaggerNo);
return axpy_norm(out,4+mass,in,out);
}
template <class Impl>
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template<class Impl>
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
out.checkerboard=in.checkerboard;
Dhop(in,out,DaggerYes);
return axpy_norm(out,4+mass,in,out);
}
template <class Impl>
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
typename FermionField::scalar_type scal(4.0 + mass);
out = scal * in;
}
template<class Impl>
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if ( in.checkerboard == Odd ) {
DhopEO(in,out,DaggerNo);
} else {
DhopOE(in,out,DaggerNo);
}
}
template<class Impl>
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if ( in.checkerboard == Odd ) {
DhopEO(in,out,DaggerYes);
} else {
DhopOE(in,out,DaggerYes);
template <class Impl>
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Mooee(in, out);
}
template <class Impl>
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
out = (1.0 / (4.0 + mass)) * in;
}
template <class Impl>
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in,
FermionField &out) {
out.checkerboard = in.checkerboard;
MooeeInv(in, out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
template <class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField &mat, const FermionField &A,
const FermionField &B, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
Atilde = A;
st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) {
////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag
////////////////////////////////////////////////////////////////////////
int gamma = mu;
if (!dag) gamma += Nd;
////////////////////////
// Call the single hop
////////////////////////
PARALLEL_FOR_LOOP
for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu,
gamma);
}
//////////////////////////////////////////////////
// spin trace outer product
//////////////////////////////////////////////////
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
}
}
template <class Impl>
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U,
const FermionField &V, int dag) {
conformable(U._grid, _grid);
conformable(U._grid, V._grid);
conformable(U._grid, mat._grid);
mat.checkerboard = U.checkerboard;
DerivInternal(Stencil, Umu, mat, U, V, dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U,
const FermionField &V, int dag) {
conformable(U._grid, _cbgrid);
conformable(U._grid, V._grid);
conformable(U._grid, mat._grid);
assert(V.checkerboard == Even);
assert(U.checkerboard == Odd);
mat.checkerboard = Odd;
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U,
const FermionField &V, int dag) {
conformable(U._grid, _cbgrid);
conformable(U._grid, V._grid);
conformable(U._grid, mat._grid);
assert(V.checkerboard == Odd);
assert(U.checkerboard == Even);
mat.checkerboard = Even;
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
}
template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out,
int dag) {
conformable(in._grid, _grid); // verifies full grid
conformable(in._grid, out._grid);
out.checkerboard = in.checkerboard;
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out,
int dag) {
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
assert(in.checkerboard == Even);
out.checkerboard = Odd;
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,
int dag) {
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
assert(in.checkerboard == Odd);
out.checkerboard = Even;
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out,
int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template <class Impl>
void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,
int dir, int disp) {
int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * 4;
DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
};
template <class Impl>
void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
int dirdisp, int gamma, int dag) {
Compressor compressor(dag);
Stencil.HaloExchange(in, compressor);
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out,
dirdisp, gamma);
}
};
template <class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in, compressor);
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
out);
}
} else {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
out);
}
}
};
template<class Impl>
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
typename FermionField::scalar_type scal(4.0+mass);
out = scal*in;
}
template<class Impl>
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Mooee(in,out);
}
template<class Impl>
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
out = (1.0/(4.0+mass))*in;
}
template<class Impl>
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
MooeeInv(in,out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
template<class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
Atilde = A;
st.HaloExchange(B,compressor);
for(int mu=0;mu<Nd;mu++){
////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag
////////////////////////////////////////////////////////////////////////
int gamma = mu;
if ( !dag ) gamma+= Nd;
////////////////////////
// Call the single hop
////////////////////////
PARALLEL_FOR_LOOP
for(int sss=0;sss<B._grid->oSites();sss++){
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
}
//////////////////////////////////////////////////
// spin trace outer product
//////////////////////////////////////////////////
Impl::InsertForce4D(mat,Btilde,Atilde,mu);
}
}
template<class Impl>
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
conformable(U._grid,_grid);
conformable(U._grid,V._grid);
conformable(U._grid,mat._grid);
mat.checkerboard = U.checkerboard;
DerivInternal(Stencil,Umu,mat,U,V,dag);
}
template<class Impl>
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
conformable(U._grid,_cbgrid);
conformable(U._grid,V._grid);
conformable(U._grid,mat._grid);
assert(V.checkerboard==Even);
assert(U.checkerboard==Odd);
mat.checkerboard = Odd;
DerivInternal(StencilEven,UmuOdd,mat,U,V,dag);
}
template<class Impl>
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
{
conformable(U._grid,_cbgrid);
conformable(U._grid,V._grid);
conformable(U._grid,mat._grid);
assert(V.checkerboard==Odd);
assert(U.checkerboard==Even);
mat.checkerboard = Even;
DerivInternal(StencilOdd,UmuEven,mat,U,V,dag);
}
template<class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) {
conformable(in._grid,_grid); // verifies full grid
conformable(in._grid,out._grid);
out.checkerboard = in.checkerboard;
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
}
template<class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) {
conformable(in._grid,_cbgrid); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
assert(in.checkerboard==Even);
out.checkerboard = Odd;
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
}
template<class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
conformable(in._grid,_cbgrid); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
assert(in.checkerboard==Odd);
out.checkerboard = Even;
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
}
template<class Impl>
void WilsonFermion<Impl>::Mdir (const FermionField &in, FermionField &out,int dir,int disp) {
DhopDir(in,out,dir,disp);
}
template<class Impl>
void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir,int disp){
int skip = (disp==1) ? 0 : 1;
int dirdisp = dir+skip*4;
int gamma = dir+(1-skip)*4;
DhopDirDisp(in,out,dirdisp,gamma,DaggerNo);
};
template<class Impl>
void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag) {
Compressor compressor(dag);
Stencil.HaloExchange(in,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
}
};
template<class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in,compressor);
if ( dag == DaggerYes ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
}
}
};
FermOpTemplateInstantiate(WilsonFermion);
GparityFermOpTemplateInstantiate(WilsonFermion);
}}
FermOpTemplateInstantiate(WilsonFermion);
AdjointFermOpTemplateInstantiate(WilsonFermion);
TwoIndexFermOpTemplateInstantiate(WilsonFermion);
GparityFermOpTemplateInstantiate(WilsonFermion);
}
}
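For orientation, the Wilson pieces above reduce to very simple expressions (as written in the code, with Dhop the hopping term built from the -0.5-scaled links imported in ImportGauge):
\[
  M\,\psi = D_{\rm hop}\,\psi + (4+m)\,\psi,\qquad
  M_{ee}=M_{oo}=(4+m)\,\mathbf{1},\qquad
  M_{oo}^{-1}=\tfrac{1}{4+m}\,\mathbf{1},
\]
which is exactly what axpy_norm(out, 4+mass, in, out), Mooee and MooeeInv implement.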

View File

@ -1,161 +1,155 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion.h
Source file: ./lib/qcd/action/fermion/WilsonFermion.h
Copyright (C) 2015
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_WILSON_FERMION_H
#define GRID_QCD_WILSON_FERMION_H
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_WILSON_FERMION_H
#define GRID_QCD_WILSON_FERMION_H
namespace Grid {
namespace QCD {
namespace QCD {
class WilsonFermionStatic {
public:
static int HandOptDslash; // these are a temporary hack
static int MortonOrder;
static const std::vector<int> directions ;
static const std::vector<int> displacements;
static const int npoint=8;
};
class WilsonFermionStatic {
public:
static int HandOptDslash; // these are a temporary hack
static int MortonOrder;
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
};
template<class Impl>
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
template <class Impl>
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid ;}
GridBase *GaugeRedBlackGrid(void) { return _cbgrid ;}
GridBase *FermionGrid(void) { return _grid;}
GridBase *FermionRedBlackGrid(void) { return _cbgrid;}
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut the number of routines by passing a dagger argument
// and also make the interface more uniformly consistent
//////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out);
//////////////////////////////////////////////////////////////////
// override multiply; cut the number of routines by passing a dagger argument
// and also make the interface more uniformly consistent
//////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
// could remain virtual so we can derive Clover from Wilson base
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out) ;
void MeooeDag(const FermionField &in, FermionField &out) ;
/////////////////////////////////////////////////////////
// half checkerboard operations
// could remain virtual so we can derive Clover from Wilson base
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
// allow override for twisted mass and clover
virtual void Mooee(const FermionField &in, FermionField &out) ;
virtual void MooeeDag(const FermionField &in, FermionField &out) ;
virtual void MooeeInv(const FermionField &in, FermionField &out) ;
virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
// allow override for twisted mass and clover
virtual void Mooee(const FermionField &in, FermionField &out);
virtual void MooeeDag(const FermionField &in, FermionField &out);
virtual void MooeeInv(const FermionField &in, FermionField &out);
virtual void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V,
int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U,
const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U,
const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop(const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
int gamma, int dag);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
// Constructor
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge(const GaugeField &_Umu);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
RealD mass;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field, with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
};
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop(const FermionField &in, FermionField &out,int dag) ;
void DhopOE(const FermionField &in, FermionField &out,int dag) ;
void DhopEO(const FermionField &in, FermionField &out,int dag) ;
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir (const FermionField &in, FermionField &out,int dir,int disp) ;
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
void DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag) ;
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
// Constructor
WilsonFermion(GaugeField &_Umu,
GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid,
RealD _mass,
const ImplParams &p= ImplParams()
) ;
// DoubleStore impl dependent
void ImportGauge(const GaugeField &_Umu);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
RealD mass;
GridBase * _grid;
GridBase * _cbgrid;
//Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field, with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
};
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
}
}
}
#endif


@@ -42,15 +42,15 @@ const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1
// 5d lattice for DWF.
template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _M5,const ImplParams &p) :
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _M5,const ImplParams &p) :
Kernels(p),
_FiveDimGrid(&FiveDimGrid),
_FiveDimGrid (&FiveDimGrid),
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
_FourDimGrid(&FourDimGrid),
_FourDimGrid (&FourDimGrid),
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
@@ -62,60 +62,83 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid)
{
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._checker_dim==1);
if (Impl::LsVectorised) {
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==1);
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==1);
int nsimd = Simd::Nsimd();
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
assert(FourDimGrid._ndimension==4);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimGrid._simd_layout[d]==1);
assert(FourDimRedBlackGrid._simd_layout[d]==1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
}
} else {
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._checker_dim==1);
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==1);
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
}
}
// Allocate the required comms buffer
ImportGauge(_Umu);
}
}
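// Reading of the assertions above: in the LsVectorised case the fifth
// dimension (index 0 of the five-d grid) carries the full SIMD width and is
// never split across MPI ranks, while the four space-time dimensions stay
// scalar; in the default case the SIMD and MPI decompositions of the five-d
// grid simply mirror those of the four-d gauge grid, with Ls kept local to
// each node.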
/*
template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
RealD _M5,const ImplParams &p) :
Kernels(p),
_FiveDimGrid (&FiveDimGrid),
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
_FourDimGrid (&FourDimGrid),
Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
M5(_M5),
Umu(_FourDimGrid),
UmuEven(_FourDimGrid),
UmuOdd (_FourDimGrid),
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimGrid)
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
RealD _M5,const ImplParams &p) :
{
int nsimd = Simd::Nsimd();
@@ -148,13 +171,75 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
}
{
GaugeField HUmu(_Umu._grid);
HUmu = _Umu*(-0.5);
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
UmuEven=Umu;// Really want a reference.
UmuOdd =Umu;
}
}
*/
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::vector<int> latt = GridDefaultLatt();
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _FourDimGrid->_Nprocessors;
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime
<< " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : "
<< DhopComputeTime << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
RealD mflops = 1344*volume*DhopCalls/DhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
}
if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " <<DerivComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time : " <<DerivDhopComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
}
if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report();
}
}
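// Units note for the rates printed above: usecond() measures wall-clock time
// in microseconds, so
//   mflops = 1344 * volume * DhopCalls / DhopComputeTime[us]
// is flops per microsecond, i.e. already Mflop/s; 1344 (and 144 for the Deriv
// kernel) is the per-4d-site flop count this report assigns to one
// application, and no further 1e6 factor is needed.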
template<class Impl>
void WilsonFermion5D<Impl>::ZeroCounters(void) {
DhopCalls = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DerivCalls = 0;
DerivCommTime = 0;
DerivComputeTime = 0;
DerivDhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
template<class Impl>
@@ -197,12 +282,13 @@ PARALLEL_FOR_LOOP
template<class Impl>
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
DerivCalls++;
assert((dag==DaggerNo) ||(dag==DaggerYes));
conformable(st._grid,A._grid);
@@ -213,51 +299,53 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
DerivCommTime-=usecond();
st.HaloExchange(B,compressor);
DerivCommTime+=usecond();
Atilde=A;
for(int mu=0;mu<Nd;mu++){
DerivComputeTime-=usecond();
for (int mu = 0; mu < Nd; mu++) {
////////////////////////////////////////////////////////////////////////
// Flip gamma if dag
////////////////////////////////////////////////////////////////////////
int gamma = mu;
if ( !dag ) gamma+= Nd;
if (!dag) gamma += Nd;
////////////////////////
// Call the single hop
////////////////////////
PARALLEL_FOR_LOOP
for(int sss=0;sss<U._grid->oSites();sss++){
for(int s=0;s<Ls;s++){
int sU=sss;
int sF = s+Ls*sU;
DerivDhopComputeTime -= usecond();
PARALLEL_FOR_LOOP
for (int sss = 0; sss < U._grid->oSites(); sss++) {
for (int s = 0; s < Ls; s++) {
int sU = sss;
int sF = s + Ls * sU;
assert ( sF< B._grid->oSites());
assert ( sU< U._grid->oSites());
assert(sF < B._grid->oSites());
assert(sU < U._grid->oSites());
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
////////////////////////////
// spin trace outer product
////////////////////////////
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu,
gamma);
////////////////////////////
// spin trace outer product
////////////////////////////
}
}
Impl::InsertForce5D(mat,Btilde,Atilde,mu);
DerivDhopComputeTime += usecond();
Impl::InsertForce5D(mat, Btilde, Atilde, mu);
}
DerivComputeTime += usecond();
}
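// Note on the gamma flip above: mu is the link direction handed to
// DiracOptDhopDir, while gamma selects the spin projector; for the
// un-daggered operator (dag==DaggerNo) the force for the mu link is built
// with the projector of the opposite leg, gamma = mu + Nd, whereas the
// daggered operator uses gamma == mu.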
template<class Impl>
void WilsonFermion5D<Impl>::DhopDeriv( GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A._grid,FermionGrid());
conformable(A._grid,B._grid);
@@ -270,9 +358,9 @@ void WilsonFermion5D<Impl>::DhopDeriv( GaugeField &mat,
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A._grid,FermionRedBlackGrid());
conformable(GaugeRedBlackGrid(),mat._grid);
@@ -288,9 +376,9 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A._grid,FermionRedBlackGrid());
conformable(GaugeRedBlackGrid(),mat._grid);
@@ -305,32 +393,61 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopCalls++;
// assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
int LLs = in._grid->_rdimensions[0];
DhopCommTime-=usecond();
st.HaloExchange(in,compressor);
DhopCommTime+=usecond();
DhopComputeTime-=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
if ( dag == DaggerYes ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
int sF=LLs*sU;
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
out);
}
#ifdef AVX512
} else if (stat.is_init() ) {
int nthreads;
stat.start();
#pragma omp parallel
{
#pragma omp master
nthreads = omp_get_num_threads();
int mythread = omp_get_thread_num();
stat.enter(mythread);
#pragma omp for nowait
for(int ss=0;ss<U._grid->oSites();ss++)
{
int sU=ss;
int sF=LLs*sU;
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
}
stat.exit(mythread);
}
stat.accum(nthreads);
#endif
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
int sF=LLs*sU;
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
out);
}
}
DhopComputeTime+=usecond();
}
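// Indexing note for the loops above: LLs = _rdimensions[0] is the node-local
// (and, when LsVectorised, SIMD-reduced) extent of the fifth dimension, so
// each 4d gauge site sU owns the contiguous block of fermion sites
// sF = LLs*sU ... LLs*sU + LLs - 1, which the kernel then walks internally.
// For example, with LLs = 8 the gauge site sU = 2 covers fermion sites 16..23.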
@@ -376,8 +493,6 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
}}


@@ -31,6 +31,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_WILSON_FERMION_5D_H
#define GRID_QCD_WILSON_FERMION_5D_H
#include <Grid/Stat.h>
namespace Grid {
namespace QCD {
@@ -60,6 +62,18 @@ namespace Grid {
public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
PmuStat stat;
void Report(void);
void ZeroCounters(void);
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
@@ -125,12 +139,14 @@
double _M5,const ImplParams &p= ImplParams());
// Constructors
/*
WilsonFermion5D(int simd,
GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
double _M5,const ImplParams &p= ImplParams());
*/
// DoubleStore
void ImportGauge(const GaugeField &_Umu);

View File

@@ -1,98 +1,54 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;
int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;
template<class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
template <class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{
#ifdef AVX512
if ( AsmOpt ) {
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
} else {
#else
{
#endif
for(int site=0;site<Ns;site++) {
for(int s=0;s<Ls;s++) {
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
}
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{
// No asm implementation yet.
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
// else
for(int site=0;site<Ns;site++) {
for(int s=0;s<Ls;s++) {
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
}
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
int sU, const FermionField &in, FermionField &out) {
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
@@ -102,176 +58,175 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrd
///////////////////////////
// Xp
///////////////////////////
SE=st.GetEntry(ptype,Xp,sF);
SE = st.GetEntry(ptype, Xp, sF);
if (SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjXp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjXp(chi,in._odata[SE->_offset]);
spProjXp(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
spReconXp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st);
spReconXp(result, Uchi);
///////////////////////////
// Yp
///////////////////////////
SE=st.GetEntry(ptype,Yp,sF);
SE = st.GetEntry(ptype, Yp, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjYp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjYp(chi,in._odata[SE->_offset]);
spProjYp(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
accumReconYp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st);
accumReconYp(result, Uchi);
///////////////////////////
// Zp
///////////////////////////
SE=st.GetEntry(ptype,Zp,sF);
SE = st.GetEntry(ptype, Zp, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjZp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjZp(chi,in._odata[SE->_offset]);
spProjZp(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
accumReconZp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st);
accumReconZp(result, Uchi);
///////////////////////////
// Tp
///////////////////////////
SE=st.GetEntry(ptype,Tp,sF);
SE = st.GetEntry(ptype, Tp, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjTp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjTp(chi,in._odata[SE->_offset]);
spProjTp(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
accumReconTp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st);
accumReconTp(result, Uchi);
///////////////////////////
// Xm
///////////////////////////
SE=st.GetEntry(ptype,Xm,sF);
SE = st.GetEntry(ptype, Xm, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjXm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjXm(chi,in._odata[SE->_offset]);
spProjXm(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
accumReconXm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st);
accumReconXm(result, Uchi);
///////////////////////////
// Ym
///////////////////////////
SE=st.GetEntry(ptype,Ym,sF);
SE = st.GetEntry(ptype, Ym, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjYm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjYm(chi,in._odata[SE->_offset]);
spProjYm(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
accumReconYm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st);
accumReconYm(result, Uchi);
///////////////////////////
// Zm
///////////////////////////
SE=st.GetEntry(ptype,Zm,sF);
SE = st.GetEntry(ptype, Zm, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjZm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjZm(chi,in._odata[SE->_offset]);
spProjZm(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
accumReconZm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st);
accumReconZm(result, Uchi);
///////////////////////////
// Tm
///////////////////////////
SE=st.GetEntry(ptype,Tm,sF);
SE = st.GetEntry(ptype, Tm, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTm(chi,in._odata[SE->_offset]);
if (SE->_permute) {
spProjTm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjTm(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
accumReconTm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st);
accumReconTm(result, Uchi);
vstream(out._odata[sF],result);
vstream(out._odata[sF], result);
};
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
// Need controls to do interior, exterior, or both
template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
int sU, const FermionField &in, FermionField &out) {
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
@@ -280,299 +235,298 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder
///////////////////////////
// Xp
///////////////////////////
SE=st.GetEntry(ptype,Xm,sF);
SE = st.GetEntry(ptype, Xm, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjXp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjXp(chi,in._odata[SE->_offset]);
spProjXp(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
spReconXp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xm, SE, st);
spReconXp(result, Uchi);
///////////////////////////
// Yp
///////////////////////////
SE=st.GetEntry(ptype,Ym,sF);
SE = st.GetEntry(ptype, Ym, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjYp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjYp(chi,in._odata[SE->_offset]);
spProjYp(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
accumReconYp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Ym, SE, st);
accumReconYp(result, Uchi);
///////////////////////////
// Zp
///////////////////////////
SE=st.GetEntry(ptype,Zm,sF);
SE = st.GetEntry(ptype, Zm, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjZp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjZp(chi,in._odata[SE->_offset]);
spProjZp(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
accumReconZp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Zm, SE, st);
accumReconZp(result, Uchi);
///////////////////////////
// Tp
///////////////////////////
SE=st.GetEntry(ptype,Tm,sF);
SE = st.GetEntry(ptype, Tm, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjTp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjTp(chi,in._odata[SE->_offset]);
spProjTp(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
accumReconTp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Tm, SE, st);
accumReconTp(result, Uchi);
///////////////////////////
// Xm
///////////////////////////
SE=st.GetEntry(ptype,Xp,sF);
SE = st.GetEntry(ptype, Xp, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjXm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjXm(chi,in._odata[SE->_offset]);
spProjXm(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
accumReconXm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp, SE, st);
accumReconXm(result, Uchi);
///////////////////////////
// Ym
///////////////////////////
SE=st.GetEntry(ptype,Yp,sF);
SE = st.GetEntry(ptype, Yp, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjYm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjYm(chi,in._odata[SE->_offset]);
spProjYm(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
accumReconYm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Yp, SE, st);
accumReconYm(result, Uchi);
///////////////////////////
// Zm
///////////////////////////
SE=st.GetEntry(ptype,Zp,sF);
SE = st.GetEntry(ptype, Zp, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
if (SE->_permute) {
spProjZm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjZm(chi,in._odata[SE->_offset]);
spProjZm(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
accumReconZm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Zp, SE, st);
accumReconZm(result, Uchi);
///////////////////////////
// Tm
///////////////////////////
SE=st.GetEntry(ptype,Tp,sF);
SE = st.GetEntry(ptype, Tp, sF);
if ( SE->_is_local ) {
if (SE->_is_local) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTm(chi,in._odata[SE->_offset]);
if (SE->_permute) {
spProjTm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else {
spProjTm(chi, in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
chi_p = &buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
accumReconTm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], *chi_p, Tp, SE, st);
accumReconTm(result, Uchi);
vstream(out._odata[sF],result);
vstream(out._odata[sF], result);
};
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteSpinor result;
template <class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir(
StencilImpl &st, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteSpinor result;
SiteHalfSpinor Uchi;
StencilEntry *SE;
int ptype;
SE=st.GetEntry(ptype,dir,sF);
SE = st.GetEntry(ptype, dir, sF);
// Xp
if(gamma==Xp){
if ( SE->_is_local && SE->_permute ) {
spProjXp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else if ( SE->_is_local ) {
spProjXp(chi,in._odata[SE->_offset]);
} else {
chi=buf[SE->_offset];
if (gamma == Xp) {
if (SE->_is_local && SE->_permute) {
spProjXp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else if (SE->_is_local) {
spProjXp(chi, in._odata[SE->_offset]);
} else {
chi = buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
spReconXp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
spReconXp(result, Uchi);
}
// Yp
if ( gamma==Yp ){
if ( SE->_is_local && SE->_permute ) {
spProjYp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else if ( SE->_is_local ) {
spProjYp(chi,in._odata[SE->_offset]);
} else {
chi=buf[SE->_offset];
if (gamma == Yp) {
if (SE->_is_local && SE->_permute) {
spProjYp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else if (SE->_is_local) {
spProjYp(chi, in._odata[SE->_offset]);
} else {
chi = buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
spReconYp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
spReconYp(result, Uchi);
}
// Zp
if ( gamma ==Zp ){
if ( SE->_is_local && SE->_permute ) {
spProjZp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else if ( SE->_is_local ) {
spProjZp(chi,in._odata[SE->_offset]);
} else {
chi=buf[SE->_offset];
if (gamma == Zp) {
if (SE->_is_local && SE->_permute) {
spProjZp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else if (SE->_is_local) {
spProjZp(chi, in._odata[SE->_offset]);
} else {
chi = buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
spReconZp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
spReconZp(result, Uchi);
}
// Tp
if ( gamma ==Tp ){
if ( SE->_is_local && SE->_permute ) {
spProjTp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else if ( SE->_is_local ) {
spProjTp(chi,in._odata[SE->_offset]);
} else {
chi=buf[SE->_offset];
if (gamma == Tp) {
if (SE->_is_local && SE->_permute) {
spProjTp(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else if (SE->_is_local) {
spProjTp(chi, in._odata[SE->_offset]);
} else {
chi = buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
spReconTp(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
spReconTp(result, Uchi);
}
// Xm
if ( gamma==Xm ){
if ( SE->_is_local && SE->_permute ) {
spProjXm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else if ( SE->_is_local ) {
spProjXm(chi,in._odata[SE->_offset]);
} else {
chi=buf[SE->_offset];
if (gamma == Xm) {
if (SE->_is_local && SE->_permute) {
spProjXm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else if (SE->_is_local) {
spProjXm(chi, in._odata[SE->_offset]);
} else {
chi = buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
spReconXm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
spReconXm(result, Uchi);
}
// Ym
if ( gamma == Ym ){
if ( SE->_is_local && SE->_permute ) {
spProjYm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else if ( SE->_is_local ) {
spProjYm(chi,in._odata[SE->_offset]);
} else {
chi=buf[SE->_offset];
if (gamma == Ym) {
if (SE->_is_local && SE->_permute) {
spProjYm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else if (SE->_is_local) {
spProjYm(chi, in._odata[SE->_offset]);
} else {
chi = buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
spReconYm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
spReconYm(result, Uchi);
}
// Zm
if ( gamma == Zm ){
if ( SE->_is_local && SE->_permute ) {
spProjZm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else if ( SE->_is_local ) {
spProjZm(chi,in._odata[SE->_offset]);
} else {
chi=buf[SE->_offset];
if (gamma == Zm) {
if (SE->_is_local && SE->_permute) {
spProjZm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else if (SE->_is_local) {
spProjZm(chi, in._odata[SE->_offset]);
} else {
chi = buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
spReconZm(result,Uchi);
}
// Tm
if ( gamma==Tm ) {
if ( SE->_is_local && SE->_permute ) {
spProjTm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else if ( SE->_is_local ) {
spProjTm(chi,in._odata[SE->_offset]);
} else {
chi=buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],chi,dir,SE,st);
spReconTm(result,Uchi);
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
spReconZm(result, Uchi);
}
vstream(out._odata[sF],result);
// Tm
if (gamma == Tm) {
if (SE->_is_local && SE->_permute) {
spProjTm(tmp, in._odata[SE->_offset]);
permute(chi, tmp, ptype);
} else if (SE->_is_local) {
spProjTm(chi, in._odata[SE->_offset]);
} else {
chi = buf[SE->_offset];
}
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st);
spReconTm(result, Uchi);
}
vstream(out._odata[sF], result);
}
FermOpTemplateInstantiate(WilsonKernels);
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
}}


@@ -1,34 +1,35 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.h
Source file: ./lib/qcd/action/fermion/WilsonKernels.h
Copyright (C) 2015
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_DHOP_H
#define GRID_QCD_DHOP_H
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_DHOP_H
#define GRID_QCD_DHOP_H
namespace Grid {
@@ -48,51 +49,158 @@ namespace Grid {
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool, void>::type
DiracOptDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
#ifdef AVX512
if (AsmOpt) {
WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns,
in, out);
} else {
#else
{
#endif
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if (HandOpt)
WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU,
in, out);
else
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
in, out);
sF++;
}
sU++;
}
}
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
DiracOptDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
out);
sF++;
}
sU++;
}
}
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
void>::type
DiracOptDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
#ifdef AVX512
if (AsmOpt) {
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
Ns, in, out);
} else {
#else
{
#endif
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if (HandOpt)
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
in, out);
else
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
sU, in, out);
sF++;
}
sU++;
}
}
}
template <bool EnableBool = true>
typename std::enable_if<
(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
void>::type
DiracOptDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
in, out);
sF++;
}
sU++;
}
}
void DiracOptDhopDir(
StencilImpl &st, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
int gamma);
private:
// Specialised variants
void DiracOptGenericDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptAsmDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out);
void DiracOptAsmDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out);
void DiracOptHandDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
int sF, int sU, const FermionField &in, FermionField &out);
public:
WilsonKernels(const ImplParams &p = ImplParams());
};
}
}
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
private:
// Specialised variants
void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU, const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
public:
WilsonKernels(const ImplParams &p= ImplParams());
};
}
}
#endif
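// ---------------------------------------------------------------------------
// A self-contained sketch of the enable_if dispatch used by DiracOptDhopSite /
// DiracOptDhopSiteDag above: the dummy EnableBool parameter keeps the
// condition dependent, so SFINAE quietly discards the overload whose condition
// is false instead of triggering a hard error when the class is instantiated.
// The names below (Kern, site) are illustrative only.
// ---------------------------------------------------------------------------
#include <iostream>
#include <type_traits>

template <int Dimension> struct Kern {
  template <bool EnableBool = true>
  typename std::enable_if<Dimension == 3 && EnableBool, void>::type site(void) {
    std::cout << "optimised Nc=3 path" << std::endl;  // asm / hand-unrolled kernels
  }
  template <bool EnableBool = true>
  typename std::enable_if<Dimension != 3 && EnableBool, void>::type site(void) {
    std::cout << "generic path" << std::endl;         // fallback kernel
  }
};

int main(void) {
  Kern<3> three; three.site();  // resolves to the optimised overload
  Kern<2> two;   two.site();    // resolves to the generic overload
  return 0;
}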


@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -26,59 +26,77 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
namespace QCD {
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
#if defined(AVX512)
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
static Vector<vComplexF> signs;
int setupSigns(void ){
Vector<vComplexF> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
return 1;
}
static int signInit = setupSigns();
static Vector<vComplexF> signs;
int setupSigns(void ){
Vector<vComplexF> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
return 1;
}
static int signInit = setupSigns();
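The signs table above is filled by a namespace-scope initializer: the dummy signInit variable forces setupSigns() to run during static initialization, so the vrsign/visign constants are ready before any assembly kernel dereferences signs[0] or signs[1]. A minimal sketch of the same idiom, with toy types rather than Grid's Vector<vComplexF>:

#include <cstdio>
#include <vector>

static std::vector<float> signs;                 // filled before main() runs

static int setupSigns(void) {
  signs = { +1.0f, -1.0f };                      // stand-ins for the vrsign/visign constants
  return 1;
}
static int signInit = setupSigns();              // initializer kept only for its side effect

int main() {
  // Within one translation unit, statics initialize top to bottom, so signs is ready here.
  std::printf("%f %f\n", signs[0], signs[1]);
  return 0;
}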
#define label(A) ilabel(A)
#define ilabel(A) ".globl\n" #A ":\n"
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define FX(A) WILSONASM_ ##A
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#undef KERNEL_DAG
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef VMOVIDUP
#undef VMOVRDUP
#undef MAYBEPERM
@@ -89,32 +107,43 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
template<>
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#undef KERNEL_DAG
template<>
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG
template<>
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#endif
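Note how each specialisation above is produced: the shared body in qcd/action/fermion/WilsonKernelsAsmBody.h is #include'd twice, once with KERNEL_DAG undefined (DiracOptAsmDhopSite) and once with it defined (DiracOptAsmDhopSiteDag), and #ifdef KERNEL_DAG inside the body selects the opposite spin projector. A single-file toy version of the include-the-body-twice trick; here the body is a macro and KERNEL_DAG carries a 0/1 value, purely so the sketch stays self-contained:

#include <cstdio>

// In Grid the body is a separate header pulled in with #include and tested with
// #ifdef KERNEL_DAG; a macro with a 0/1 toggle is the closest one-file analogue.
#define KERNEL_BODY(NAME)                                               \
  void NAME(int site) {                                                 \
    std::printf(#NAME ": site %d uses the %s projector\n", site,        \
                KERNEL_DAG ? "opposite (dagger)" : "standard");         \
  }

#define KERNEL_DAG 0
KERNEL_BODY(DhopSite)
#undef KERNEL_DAG

#define KERNEL_DAG 1
KERNEL_BODY(DhopSiteDag)
#undef KERNEL_DAG

int main() {
  DhopSite(0);
  DhopSiteDag(0);
  return 0;
}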
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
}}
#define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
INSTANTIATE_ASM(WilsonImplF);
INSTANTIATE_ASM(WilsonImplD);
INSTANTIATE_ASM(ZWilsonImplF);
INSTANTIATE_ASM(ZWilsonImplD);
INSTANTIATE_ASM(GparityWilsonImplF);
INSTANTIATE_ASM(GparityWilsonImplD);
INSTANTIATE_ASM(DomainWallVec5dImplF);
INSTANTIATE_ASM(DomainWallVec5dImplD);
INSTANTIATE_ASM(ZDomainWallVec5dImplF);
INSTANTIATE_ASM(ZDomainWallVec5dImplD);
}
}
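The INSTANTIATE_ASM macro above (and INSTANTIATE_THEM in the hand-unrolled file further down) simply stamps out explicit template instantiations so the long member signatures are not repeated for every Impl. A stripped-down illustration of the pattern with invented types:

#include <cstdio>

template <class Impl> struct Kernels {
  void dhopSite(int s) { std::printf("site %d\n", s); }
};

struct WilsonLikeImplF {};
struct WilsonLikeImplD {};

// One macro invocation per Impl keeps the instantiation list short and easy to extend.
#define INSTANTIATE_KERNELS(A) template struct Kernels<A>;
INSTANTIATE_KERNELS(WilsonLikeImplF)
INSTANTIATE_KERNELS(WilsonLikeImplD)

int main() {
  Kernels<WilsonLikeImplF> k;
  k.dhopSite(0);
  return 0;
}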


@@ -30,7 +30,11 @@
basep = st.GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_PROJMEM(base);
#else
XM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else {
LOAD_CHI(base);
@@ -41,15 +45,22 @@
MULT_2SPIN_DIR_PFXP(Xp,basep);
}
LOAD64(%r10,isigns);
#ifdef KERNEL_DAG
XP_RECON;
#else
XM_RECON;
#endif
////////////////////////////////
// Yp
////////////////////////////////
basep = st.GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_PROJMEM(base);
#else
YM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else {
LOAD_CHI(base);
@@ -60,7 +71,11 @@
MULT_2SPIN_DIR_PFYP(Yp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YP_RECON_ACCUM;
#else
YM_RECON_ACCUM;
#endif
////////////////////////////////
// Zp
@@ -68,7 +83,11 @@
basep = st.GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_PROJMEM(base);
#else
ZM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else {
LOAD_CHI(base);
@@ -79,7 +98,11 @@
MULT_2SPIN_DIR_PFZP(Zp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZP_RECON_ACCUM;
#else
ZM_RECON_ACCUM;
#endif
////////////////////////////////
// Tp
@@ -87,7 +110,11 @@
basep = st.GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_PROJMEM(base);
#else
TM_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else {
LOAD_CHI(base);
@@ -98,16 +125,26 @@
MULT_2SPIN_DIR_PFTP(Tp,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TP_RECON_ACCUM;
#else
TM_RECON_ACCUM;
#endif
////////////////////////////////
// Xm
////////////////////////////////
#ifndef STREAM_STORE
basep= (uint64_t) &out._odata[ss];
#endif
// basep= st.GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_PROJMEM(base);
#else
XP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR3,perm);
} else {
LOAD_CHI(base);
@@ -118,7 +155,11 @@
MULT_2SPIN_DIR_PFXM(Xm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
XM_RECON_ACCUM;
#else
XP_RECON_ACCUM;
#endif
////////////////////////////////
// Ym
@@ -126,7 +167,11 @@
basep= st.GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_PROJMEM(base);
#else
YP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR2,perm);
} else {
LOAD_CHI(base);
@@ -137,7 +182,11 @@
MULT_2SPIN_DIR_PFYM(Ym,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
YM_RECON_ACCUM;
#else
YP_RECON_ACCUM;
#endif
////////////////////////////////
// Zm
@@ -145,7 +194,11 @@
basep= st.GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_PROJMEM(base);
#else
ZP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR1,perm);
} else {
LOAD_CHI(base);
@@ -156,7 +209,11 @@
MULT_2SPIN_DIR_PFZM(Zm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
ZM_RECON_ACCUM;
#else
ZP_RECON_ACCUM;
#endif
////////////////////////////////
// Tm
@@ -164,18 +221,28 @@
basep= st.GetPFInfo(nent,plocal); nent++;
if ( local ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_PROJMEM(base);
#else
TP_PROJMEM(base);
#endif
MAYBEPERM(PERMUTE_DIR0,perm);
} else {
LOAD_CHI(base);
}
base= (uint64_t) &out._odata[ss];
#ifndef STREAM_STORE
PREFETCH_CHIMU(base);
#endif
{
MULT_2SPIN_DIR_PFTM(Tm,basep);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
#ifdef KERNEL_DAG
TM_RECON_ACCUM;
#else
TP_RECON_ACCUM;
#endif
basep= st.GetPFInfo(nent,plocal); nent++;
SAVE_RESULT(base,basep);
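The KERNEL_DAG branches above always exchange a projector with its opposite (XP_PROJMEM vs XM_PROJMEM, and likewise for the RECON accumulations). The reason is the standard structure of the Wilson hopping term; up to normalisation conventions,

D_{\rm hop}\,\psi(x) \;=\; -\tfrac{1}{2}\sum_{\mu}\Big[\,U_\mu(x)\,(1-\gamma_\mu)\,\psi(x+\hat\mu) \;+\; U_\mu^\dagger(x-\hat\mu)\,(1+\gamma_\mu)\,\psi(x-\hat\mu)\,\Big],

D_{\rm hop}^\dagger\,\psi(x) \;=\; -\tfrac{1}{2}\sum_{\mu}\Big[\,U_\mu(x)\,(1+\gamma_\mu)\,\psi(x+\hat\mu) \;+\; U_\mu^\dagger(x-\hat\mu)\,(1-\gamma_\mu)\,\psi(x-\hat\mu)\,\Big],

so the daggered kernel applies (1+\gamma_\mu) where the plain kernel applies (1-\gamma_\mu) and vice versa; toggling KERNEL_DAG swaps each direction's projection and reconstruction macros accordingly.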


@@ -311,8 +311,8 @@ namespace Grid {
namespace QCD {
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
template<class Impl>
void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@@ -554,8 +554,8 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &l
}
}
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
template<class Impl>
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@@ -839,46 +839,23 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
////////////// Wilson ; uses this implementation /////////////////////
// Need Nc=3 though //
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
#define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
int ss,int sU,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,\
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
INSTANTIATE_THEM(WilsonImplF);
INSTANTIATE_THEM(WilsonImplD);
INSTANTIATE_THEM(ZWilsonImplF);
INSTANTIATE_THEM(ZWilsonImplD);
INSTANTIATE_THEM(GparityWilsonImplF);
INSTANTIATE_THEM(GparityWilsonImplD);
INSTANTIATE_THEM(DomainWallVec5dImplF);
INSTANTIATE_THEM(DomainWallVec5dImplD);
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
INSTANTIATE_THEM(ZDomainWallVec5dImplD);
}}


@@ -28,7 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_WILSON_TM_FERMION_H
#define GRID_QCD_WILSON_TM_FERMION_H
#include <Grid.h>
#include <Grid/Grid.h>
namespace Grid {


@@ -0,0 +1,79 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusFermion.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_ZMOBIUS_FERMION_H
#define GRID_QCD_ZMOBIUS_FERMION_H
#include <Grid/Grid.h>
namespace Grid {
namespace QCD {
template<class Impl>
class ZMobiusFermion : public CayleyFermion5D<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
public:
virtual void Instantiatable(void) {};
// Constructors
ZMobiusFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,
std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) :
CayleyFermion5D<Impl>(_Umu,
FiveDimGrid,
FiveDimRedBlackGrid,
FourDimGrid,
FourDimRedBlackGrid,_mass,_M5,p)
{
RealD eps = 1.0;
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
std::vector<Coeff_t> zgamma(this->Ls);
for(int s=0;s<this->Ls;s++){
zgamma[s] = gamma[s];
}
// Call base setter
this->SetCoefficientsInternal(1.0,zgamma,b,c);
}
};
}
}
#endif
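For context, a minimal construction sketch for the new ZMobiusFermion class. The 4d/5d grid helpers, the LatticeGaugeField setup and the ZMobiusFermionR typedef below follow common Grid usage but are assumptions for illustration, not something this diff establishes:

#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  const int Ls = 8;
  GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  LatticeGaugeField Umu(UGrid);                           // assume a configuration is loaded into Umu

  std::vector<ComplexD> gamma(Ls, ComplexD(1.0, 0.0));    // s-dependent complex coefficients
  RealD mass = 0.01, M5 = 1.8, b = 1.0, c = 0.0;

  // Matches the constructor added above: gauge field, 5d/4d grids, mass, M5, gamma, b, c.
  ZMobiusFermionR Dzmob(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, gamma, b, c);

  Grid_finalize();
  return 0;
}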


@@ -1,181 +1,194 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/GaugeImpl.h
Source file: ./lib/qcd/action/gauge/GaugeImpl.h
Copyright (C) 2015
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_GAUGE_IMPL_H
#define GRID_QCD_GAUGE_IMPL_H
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_GAUGE_IMPL_H
#define GRID_QCD_GAUGE_IMPL_H
namespace Grid {
namespace QCD {
namespace QCD {
////////////////////////////////////////////////////////////////////////
// Implementation dependent gauge types
////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Implementation dependent gauge types
////////////////////////////////////////////////////////////////////////
template <class Gimpl> class WilsonLoops;
template<class Gimpl> class WilsonLoops;
#define INHERIT_GIMPL_TYPES(GImpl) \
typedef typename GImpl::Simd Simd; \
typedef typename GImpl::GaugeLinkField GaugeLinkField; \
typedef typename GImpl::GaugeField GaugeField; \
typedef typename GImpl::SiteGaugeField SiteGaugeField; \
typedef typename GImpl::SiteGaugeLink SiteGaugeLink;
#define INHERIT_GIMPL_TYPES(GImpl) \
typedef typename GImpl::Simd Simd;\
typedef typename GImpl::GaugeLinkField GaugeLinkField;\
typedef typename GImpl::GaugeField GaugeField;\
typedef typename GImpl::SiteGaugeField SiteGaugeField;\
typedef typename GImpl::SiteGaugeLink SiteGaugeLink;
//
template <class S, int Nrepresentation = Nc> class GaugeImplTypes {
public:
typedef S Simd;
template <typename vtype>
using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation>>>;
template <typename vtype>
using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation>>, Nd>;
//
template<class S,int Nrepresentation=Nc>
class GaugeImplTypes {
public:
typedef S Simd;
template<typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
template<typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
typedef iImplGaugeLink <Simd> SiteGaugeLink;
typedef iImplGaugeField <Simd> SiteGaugeField;
typedef Lattice<SiteGaugeLink> GaugeLinkField; // bit ugly naming; polarised gauge field, lorentz... all ugly
typedef Lattice<SiteGaugeField> GaugeField;
typedef iImplGaugeLink<Simd> SiteGaugeLink;
typedef iImplGaugeField<Simd> SiteGaugeField;
};
typedef Lattice<SiteGaugeLink> GaugeLinkField; // bit ugly naming; polarised
// gauge field, lorentz... all
// ugly
typedef Lattice<SiteGaugeField> GaugeField;
// Composition with smeared link, bc's etc.. probably need multiple inheritance
// Variable precision "S" and variable Nc
template<class GimplTypes>
class PeriodicGaugeImpl : public GimplTypes {
public:
INHERIT_GIMPL_TYPES(GimplTypes);
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Support needed for the assembly of loops including all boundary condition effects such as conjugate bcs
////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class covariant> static inline
Lattice<covariant> CovShiftForward (const GaugeLinkField &Link, int mu, const Lattice<covariant> &field) {
return PeriodicBC::CovShiftForward(Link,mu,field);
}
template<class covariant> static inline
Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,const Lattice<covariant> &field) {
return PeriodicBC::CovShiftBackward(Link,mu,field);
}
static inline
GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
return Cshift(adj(Link),mu,-1);
}
static inline
GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
return Link;
}
static inline
GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
return Cshift(Link,mu,1);
}
static inline bool isPeriodicGaugeField(void) {
return true;
}
};
// Composition with smeared link, bc's etc.. probably need multiple inheritance
// Variable precision "S" and variable Nc
template<class GimplTypes>
class ConjugateGaugeImpl : public GimplTypes {
public:
INHERIT_GIMPL_TYPES(GimplTypes);
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Support needed for the assembly of loops including all boundary condition effects such as Gparity.
////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class covariant> static
Lattice<covariant> CovShiftForward (const GaugeLinkField &Link, int mu, const Lattice<covariant> &field) {
return ConjugateBC::CovShiftForward(Link,mu,field);
// Move this elsewhere? FIXME
static inline void AddGaugeLink(GaugeField &U, GaugeLinkField &W,
int mu) { // U[mu] += W
PARALLEL_FOR_LOOP
for (auto ss = 0; ss < U._grid->oSites(); ss++) {
U._odata[ss]._internal[mu] =
U._odata[ss]._internal[mu] + W._odata[ss]._internal;
}
template<class covariant> static
Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,const Lattice<covariant> &field) {
return ConjugateBC::CovShiftBackward(Link,mu,field);
}
static inline
GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
GridBase *grid = Link._grid;
int Lmu = grid->GlobalDimensions()[mu]-1;
Lattice<iScalar<vInteger> > coor(grid); LatticeCoordinate(coor,mu);
GaugeLinkField tmp (grid);
tmp=adj(Link);
tmp = where(coor==Lmu,conjugate(tmp),tmp);
return Cshift(tmp,mu,-1);// moves towards positive mu
}
static inline
GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
return Link;
}
static inline
GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
GridBase *grid = Link._grid;
int Lmu = grid->GlobalDimensions()[mu]-1;
Lattice<iScalar<vInteger> > coor(grid); LatticeCoordinate(coor,mu);
GaugeLinkField tmp (grid);
tmp=Cshift(Link,mu,1);
tmp=where(coor==Lmu,conjugate(tmp),tmp);
return tmp;
}
static inline bool isPeriodicGaugeField(void) {
return false;
}
};
typedef GaugeImplTypes<vComplex,Nc> GimplTypesR;
typedef GaugeImplTypes<vComplexF,Nc> GimplTypesF;
typedef GaugeImplTypes<vComplexD,Nc> GimplTypesD;
typedef PeriodicGaugeImpl<GimplTypesR> PeriodicGimplR; // Real.. whichever prec
typedef PeriodicGaugeImpl<GimplTypesF> PeriodicGimplF; // Float
typedef PeriodicGaugeImpl<GimplTypesD> PeriodicGimplD; // Double
typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever prec
typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
}
};
// Composition with smeared link, bc's etc.. probably need multiple inheritance
// Variable precision "S" and variable Nc
template <class GimplTypes> class PeriodicGaugeImpl : public GimplTypes {
public:
INHERIT_GIMPL_TYPES(GimplTypes);
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Support needed for the assembly of loops including all boundary condition
// effects such as conjugate bcs
////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class covariant>
static inline Lattice<covariant>
CovShiftForward(const GaugeLinkField &Link, int mu,
const Lattice<covariant> &field) {
return PeriodicBC::CovShiftForward(Link, mu, field);
}
template <class covariant>
static inline Lattice<covariant>
CovShiftBackward(const GaugeLinkField &Link, int mu,
const Lattice<covariant> &field) {
return PeriodicBC::CovShiftBackward(Link, mu, field);
}
static inline GaugeLinkField
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
return Cshift(adj(Link), mu, -1);
}
static inline GaugeLinkField
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
return Link;
}
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
return Cshift(Link, mu, 1);
}
static inline bool isPeriodicGaugeField(void) { return true; }
};
// Composition with smeared link, bc's etc.. probably need multiple inheritance
// Variable precision "S" and variable Nc
template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
public:
INHERIT_GIMPL_TYPES(GimplTypes);
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Support needed for the assembly of loops including all boundary condition
// effects such as Gparity.
////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class covariant>
static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
const Lattice<covariant> &field) {
return ConjugateBC::CovShiftForward(Link, mu, field);
}
template <class covariant>
static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
const Lattice<covariant> &field) {
return ConjugateBC::CovShiftBackward(Link, mu, field);
}
static inline GaugeLinkField
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
GridBase *grid = Link._grid;
int Lmu = grid->GlobalDimensions()[mu] - 1;
Lattice<iScalar<vInteger>> coor(grid);
LatticeCoordinate(coor, mu);
GaugeLinkField tmp(grid);
tmp = adj(Link);
tmp = where(coor == Lmu, conjugate(tmp), tmp);
return Cshift(tmp, mu, -1); // moves towards positive mu
}
static inline GaugeLinkField
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
return Link;
}
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
GridBase *grid = Link._grid;
int Lmu = grid->GlobalDimensions()[mu] - 1;
Lattice<iScalar<vInteger>> coor(grid);
LatticeCoordinate(coor, mu);
GaugeLinkField tmp(grid);
tmp = Cshift(Link, mu, 1);
tmp = where(coor == Lmu, conjugate(tmp), tmp);
return tmp;
}
static inline bool isPeriodicGaugeField(void) { return false; }
};
typedef GaugeImplTypes<vComplex, Nc> GimplTypesR;
typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF;
typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD;
typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR;
typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF;
typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD;
typedef PeriodicGaugeImpl<GimplTypesR> PeriodicGimplR; // Real.. whichever prec
typedef PeriodicGaugeImpl<GimplTypesF> PeriodicGimplF; // Float
typedef PeriodicGaugeImpl<GimplTypesD> PeriodicGimplD; // Double
typedef PeriodicGaugeImpl<GimplAdjointTypesR> PeriodicGimplAdjR; // Real.. whichever prec
typedef PeriodicGaugeImpl<GimplAdjointTypesF> PeriodicGimplAdjF; // Float
typedef PeriodicGaugeImpl<GimplAdjointTypesD> PeriodicGimplAdjD; // Double
typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever prec
typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
}
}
#endif
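Because everything above is carried through the Gimpl typedefs, higher-level observables and actions switch boundary conditions simply by swapping the implementation type. A small sketch; WilsonLoops and avgPlaquette are part of Grid, but treat the exact calls below as illustrative under those assumptions rather than as fixed API:

#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  LatticeGaugeField Umu(UGrid);   // assume a configuration is loaded into Umu

  // Periodic links: ShiftStaple / CovShiftIdentityBackward reduce to plain Cshifts.
  RealD plaqPeriodic  = WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  // Conjugate bcs: links crossing the last slice in each mu are complex conjugated.
  RealD plaqConjugate = WilsonLoops<ConjugateGimplR>::avgPlaquette(Umu);

  std::cout << GridLogMessage << "plaquette periodic=" << plaqPeriodic
            << " conjugate=" << plaqConjugate << std::endl;

  Grid_finalize();
  return 0;
}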


@@ -1,212 +1,214 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
Source file: ./lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
Copyright (C) 2015
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
#define QCD_PSEUDOFERMION_ONE_FLAVOUR_EVEN_ODD_RATIONAL_H
namespace Grid{
namespace QCD{
namespace Grid {
namespace QCD {
///////////////////////////////////////
// One flavour rational
///////////////////////////////////////
///////////////////////////////////////
// One flavour rational
///////////////////////////////////////
// S_f = chi^dag * N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
// S_f = chi^dag * N(Mpc^dag*Mpc)/D(Mpc^dag*Mpc) * chi
//
// Here, M is some operator
// N and D makeup the rat. poly
//
template <class Impl>
class OneFlavourEvenOddRationalPseudoFermionAction
: public Action<typename Impl::GaugeField> {
public:
INHERIT_IMPL_TYPES(Impl);
typedef OneFlavourRationalParams Params;
Params param;
MultiShiftFunction PowerHalf;
MultiShiftFunction PowerNegHalf;
MultiShiftFunction PowerQuarter;
MultiShiftFunction PowerNegQuarter;
private:
FermionOperator<Impl> &FermOp; // the basic operator
// NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us
// historically
// and hasenbusch works better
FermionField PhiEven; // the pseudo fermion field for this trajectory
FermionField PhiOdd; // the pseudo fermion field for this trajectory
public:
OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl> &Op,
Params &p)
: FermOp(Op),
PhiEven(Op.FermionRedBlackGrid()),
PhiOdd(Op.FermionRedBlackGrid()),
param(p) {
AlgRemez remez(param.lo, param.hi, param.precision);
// MdagM^(+- 1/2)
std::cout << GridLogMessage << "Generating degree " << param.degree
<< " for x^(1/2)" << std::endl;
remez.generateApprox(param.degree, 1, 2);
PowerHalf.Init(remez, param.tolerance, false);
PowerNegHalf.Init(remez, param.tolerance, true);
// MdagM^(+- 1/4)
std::cout << GridLogMessage << "Generating degree " << param.degree
<< " for x^(1/4)" << std::endl;
remez.generateApprox(param.degree, 1, 4);
PowerQuarter.Init(remez, param.tolerance, false);
PowerNegQuarter.Init(remez, param.tolerance, true);
};
virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
// = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
// Phi = MpcdagMpc^{1/4} eta
//
// Here, M is some operator
// N and D makeup the rat. poly
// P(eta) = e^{- eta^dag eta}
//
template<class Impl>
class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
public:
INHERIT_IMPL_TYPES(Impl);
// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
//
// So eta should be of width sig = 1/sqrt(2).
typedef OneFlavourRationalParams Params;
Params param;
RealD scale = std::sqrt(0.5);
MultiShiftFunction PowerHalf ;
MultiShiftFunction PowerNegHalf;
MultiShiftFunction PowerQuarter;
MultiShiftFunction PowerNegQuarter;
FermionField eta(FermOp.FermionGrid());
FermionField etaOdd(FermOp.FermionRedBlackGrid());
FermionField etaEven(FermOp.FermionRedBlackGrid());
private:
FermionOperator<Impl> & FermOp;// the basic operator
gaussian(pRNG, eta);
eta = eta * scale;
// NOT using "Nroots"; IroIro is -- perhaps later, but this wasn't good for us historically
// and hasenbusch works better
pickCheckerboard(Even, etaEven, eta);
pickCheckerboard(Odd, etaOdd, eta);
FermionField PhiEven; // the pseudo fermion field for this trajectory
FermionField PhiOdd; // the pseudo fermion field for this trajectory
FermOp.ImportGauge(U);
public:
// multishift CG
SchurDifferentiableOperator<Impl> Mpc(FermOp);
ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerQuarter);
msCG(Mpc, etaOdd, PhiOdd);
OneFlavourEvenOddRationalPseudoFermionAction(FermionOperator<Impl> &Op,
Params & p ) : FermOp(Op),
PhiEven(Op.FermionRedBlackGrid()),
PhiOdd (Op.FermionRedBlackGrid()),
param(p)
{
AlgRemez remez(param.lo,param.hi,param.precision);
//////////////////////////////////////////////////////
// FIXME : Clover term not yet..
//////////////////////////////////////////////////////
// MdagM^(+- 1/2)
std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
remez.generateApprox(param.degree,1,2);
PowerHalf.Init(remez,param.tolerance,false);
PowerNegHalf.Init(remez,param.tolerance,true);
assert(FermOp.ConstEE() == 1);
PhiEven = zero;
};
// MdagM^(+- 1/4)
std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
remez.generateApprox(param.degree,1,4);
PowerQuarter.Init(remez,param.tolerance,false);
PowerNegQuarter.Init(remez,param.tolerance,true);
};
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
//////////////////////////////////////////////////////
// S = phi^dag (Mdag M)^-1/2 phi
//////////////////////////////////////////////////////
virtual RealD S(const GaugeField &U) {
FermOp.ImportGauge(U);
// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
// = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
// Phi = MpcdagMpc^{1/4} eta
//
// P(eta) = e^{- eta^dag eta}
//
// e^{-x^2/(2 sig^2)} => sig^2 = 0.5.
//
// So eta should be of width sig = 1/sqrt(2).
FermionField Y(FermOp.FermionRedBlackGrid());
RealD scale = std::sqrt(0.5);
SchurDifferentiableOperator<Impl> Mpc(FermOp);
FermionField eta (FermOp.FermionGrid());
FermionField etaOdd (FermOp.FermionRedBlackGrid());
FermionField etaEven(FermOp.FermionRedBlackGrid());
ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,
PowerNegQuarter);
gaussian(pRNG,eta); eta=eta*scale;
msCG(Mpc, PhiOdd, Y);
pickCheckerboard(Even,etaEven,eta);
pickCheckerboard(Odd,etaOdd,eta);
RealD action = norm2(Y);
std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 "
"solve or -1/2 solve faster??? "
<< action << std::endl;
FermOp.ImportGauge(U);
return action;
};
// multishift CG
SchurDifferentiableOperator<Impl> Mpc(FermOp);
ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerQuarter);
msCG(Mpc,etaOdd,PhiOdd);
//////////////////////////////////////////////////////
// Need
// dS_f/dU = chi^dag d[N/D] chi
//
// N/D is expressed as partial fraction expansion:
//
// a0 + \sum_k ak/(M^dagM + bk)
//
// d[N/D] is then
//
// \sum_k -ak [M^dagM+bk]^{-1} [ dM^dag M + M^dag dM ] [M^dag M +
// bk]^{-1}
//
// Need
// Mf Phi_k = [MdagM+bk]^{-1} Phi
// Mf Phi = \sum_k ak [MdagM+bk]^{-1} Phi
//
// With these building blocks
//
// dS/dU = \sum_k -ak Mf Phi_k^dag [ dM^dag M + M^dag dM ] Mf
// Phi_k
// S = innerprodReal(Phi,Mf Phi);
//////////////////////////////////////////////////////
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
const int Npole = PowerNegHalf.poles.size();
//////////////////////////////////////////////////////
// FIXME : Clover term not yet..
//////////////////////////////////////////////////////
std::vector<FermionField> MPhi_k(Npole, FermOp.FermionRedBlackGrid());
assert(FermOp.ConstEE() == 1);
PhiEven = zero;
};
FermionField X(FermOp.FermionRedBlackGrid());
FermionField Y(FermOp.FermionRedBlackGrid());
//////////////////////////////////////////////////////
// S = phi^dag (Mdag M)^-1/2 phi
//////////////////////////////////////////////////////
virtual RealD S(const GaugeField &U) {
GaugeField tmp(FermOp.GaugeGrid());
FermOp.ImportGauge(U);
FermOp.ImportGauge(U);
FermionField Y(FermOp.FermionRedBlackGrid());
SchurDifferentiableOperator<Impl> Mpc(FermOp);
SchurDifferentiableOperator<Impl> Mpc(FermOp);
ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegQuarter);
ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, PowerNegHalf);
msCG(Mpc,PhiOdd,Y);
msCG(Mpc, PhiOdd, MPhi_k);
RealD action = norm2(Y);
std::cout << GridLogMessage << "Pseudofermion action FIXME -- is -1/4 solve or -1/2 solve faster??? "<<action<<std::endl;
dSdU = zero;
for (int k = 0; k < Npole; k++) {
RealD ak = PowerNegHalf.residues[k];
return action;
};
X = MPhi_k[k];
//////////////////////////////////////////////////////
// Need
// dS_f/dU = chi^dag d[N/D] chi
//
// N/D is expressed as partial fraction expansion:
//
// a0 + \sum_k ak/(M^dagM + bk)
//
// d[N/D] is then
//
// \sum_k -ak [M^dagM+bk]^{-1} [ dM^dag M + M^dag dM ] [M^dag M + bk]^{-1}
//
// Need
// Mf Phi_k = [MdagM+bk]^{-1} Phi
// Mf Phi = \sum_k ak [MdagM+bk]^{-1} Phi
//
// With these building blocks
//
// dS/dU = \sum_k -ak Mf Phi_k^dag [ dM^dag M + M^dag dM ] Mf Phi_k
// S = innerprodReal(Phi,Mf Phi);
//////////////////////////////////////////////////////
virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
Mpc.Mpc(X, Y);
Mpc.MpcDeriv(tmp, Y, X);
dSdU = dSdU + ak * tmp;
Mpc.MpcDagDeriv(tmp, X, Y);
dSdU = dSdU + ak * tmp;
}
const int Npole = PowerNegHalf.poles.size();
std::vector<FermionField> MPhi_k (Npole,FermOp.FermionRedBlackGrid());
FermionField X(FermOp.FermionRedBlackGrid());
FermionField Y(FermOp.FermionRedBlackGrid());
GaugeField tmp(FermOp.GaugeGrid());
FermOp.ImportGauge(U);
SchurDifferentiableOperator<Impl> Mpc(FermOp);
ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter,PowerNegHalf);
msCG(Mpc,PhiOdd,MPhi_k);
dSdU = zero;
for(int k=0;k<Npole;k++){
RealD ak = PowerNegHalf.residues[k];
X = MPhi_k[k];
Mpc.Mpc(X,Y);
Mpc.MpcDeriv (tmp , Y, X ); dSdU=dSdU+ak*tmp;
Mpc.MpcDagDeriv(tmp , X, Y ); dSdU=dSdU+ak*tmp;
}
dSdU = Ta(dSdU);
};
};
}
// dSdU = Ta(dSdU);
};
};
}
}
#endif
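A compact restatement of the pseudofermion logic spelled out in the comments above (standard RHMC, written here in LaTeX for readability):

S_f[\Phi,U] \;=\; \Phi^\dagger\,\big(M_{pc}^\dagger M_{pc}\big)^{-1/2}\,\Phi .

Refresh: draw \eta with P(\eta)\propto e^{-\eta^\dagger\eta}; matching a Gaussian e^{-|x|^2/(2\sigma^2)} gives \sigma^2 = 1/2, i.e. \sigma = 1/\sqrt{2}, which is the scale = sqrt(0.5) factor in refresh(). Then set

\Phi \;=\; \big(M_{pc}^\dagger M_{pc}\big)^{1/4}\,\eta

via the PowerQuarter multishift solve, so that \Phi^\dagger (M_{pc}^\dagger M_{pc})^{-1/2}\Phi = \eta^\dagger\eta and \Phi carries the desired weight. The action routine applies the PowerNegQuarter approximation, Y = (M_{pc}^\dagger M_{pc})^{-1/4}\Phi_{odd}, and returns S = |Y|^2. The force uses the partial-fraction form of x^{-1/2},

\big(M^\dagger M\big)^{-1/2} \;\approx\; a_0 + \sum_k \frac{a_k}{M^\dagger M + b_k},
\qquad
\frac{dS}{dU} \;=\; -\sum_k a_k\,\Phi_k^\dagger\big[\,dM^\dagger\,M + M^\dagger\,dM\,\big]\Phi_k,
\quad
\Phi_k = \big[M^\dagger M + b_k\big]^{-1}\Phi ,

which is exactly the loop over the Npole poles performed in deriv().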
